library(swimplot) library(grid) library(gtable) library(readr)
library(mosaic) library(dplyr) library(survival) library(survminer)
library(ggplot2) library(scales) library(coxphf) library(ggthemes)
library(tidyverse) library(gtsummary) library(flextable)
library(parameters) library(car) library(ComplexHeatmap)
library(tidyverse) library(readxl) library(janitor) library(DT)
library(pROC) library(rms)
#ctDNA Detection Rates by Window and Stages
#ctDNA at MRD by Stage
rm(list=ls())
setwd("~/Downloads")
circ_data <- read.csv("RWE UTUC_Clinical Data 102025.csv")
circ_data <- circ_data[circ_data$Final.cohort=="TRUE",]
circ_data <- circ_data[circ_data$ctDNA.MRD!="",]
circ_data$ctDNA.MRD <- factor(circ_data$ctDNA.MRD, levels=c("NEGATIVE","POSITIVE"))
circ_data <- subset(circ_data, ctDNA.MRD %in% c("NEGATIVE", "POSITIVE"))
circ_data$Stage <- factor(circ_data$Stage, levels=c("0/I","II","III"))
positive_counts_by_stage <- aggregate(circ_data$ctDNA.MRD == "POSITIVE", by=list(circ_data$Stage), FUN=sum)
total_counts_by_stage <- aggregate(circ_data$ctDNA.MRD, by=list(circ_data$Stage), FUN=length)
combined_data <- data.frame(
Stage = total_counts_by_stage$Group.1,
Total_Count = total_counts_by_stage$x,
Positive_Count = positive_counts_by_stage$x,
Rate = (positive_counts_by_stage$x / total_counts_by_stage$x) * 100 # Convert to percentage
)
combined_data$Rate <- sprintf("%.2f%%", combined_data$Rate)
overall_total_count <- nrow(circ_data)
overall_positive_count <- nrow(circ_data[circ_data$ctDNA.MRD == "POSITIVE",])
overall_positivity_rate <- (overall_positive_count / overall_total_count) * 100 # Convert to percentage
overall_row <- data.frame(
Stage = "Overall",
Total_Count = overall_total_count,
Positive_Count = overall_positive_count,
Rate = sprintf("%.2f%%", overall_positivity_rate)
)
combined_data <- rbind(combined_data, overall_row)
print(combined_data)
Stage Total_Count Positive_Count Rate
1 0/I 6 2 33.33%
2 II 6 4 66.67%
3 III 16 9 56.25%
4 Overall 28 15 53.57%
#ctDNA at MRD by NAC status
rm(list=ls())
setwd("~/Downloads")
circ_data <- read.csv("RWE UTUC_Clinical Data 102025.csv")
circ_data <- circ_data[circ_data$Final.cohort=="TRUE",]
circ_data <- circ_data[circ_data$ctDNA.MRD!="",]
circ_data$ctDNA.MRD <- factor(circ_data$ctDNA.MRD, levels=c("NEGATIVE","POSITIVE"))
circ_data <- subset(circ_data, ctDNA.MRD %in% c("NEGATIVE", "POSITIVE"))
circ_data$NAC <- factor(circ_data$NAC, levels=c("FALSE","TRUE"))
positive_counts_by_stage <- aggregate(circ_data$ctDNA.MRD == "POSITIVE", by=list(circ_data$NAC), FUN=sum)
total_counts_by_stage <- aggregate(circ_data$ctDNA.MRD, by=list(circ_data$NAC), FUN=length)
combined_data <- data.frame(
Stage = total_counts_by_stage$Group.1,
Total_Count = total_counts_by_stage$x,
Positive_Count = positive_counts_by_stage$x,
Rate = (positive_counts_by_stage$x / total_counts_by_stage$x) * 100 # Convert to percentage
)
combined_data$Rate <- sprintf("%.2f%%", combined_data$Rate)
overall_total_count <- nrow(circ_data)
overall_positive_count <- nrow(circ_data[circ_data$ctDNA.MRD == "POSITIVE",])
overall_positivity_rate <- (overall_positive_count / overall_total_count) * 100 # Convert to percentage
overall_row <- data.frame(
Stage = "Overall",
Total_Count = overall_total_count,
Positive_Count = overall_positive_count,
Rate = sprintf("%.2f%%", overall_positivity_rate)
)
combined_data <- rbind(combined_data, overall_row)
print(combined_data)
Stage Total_Count Positive_Count Rate
1 FALSE 23 12 52.17%
2 TRUE 5 3 60.00%
3 Overall 28 15 53.57%
#ctDNA at Surveillance by Stage
rm(list=ls())
setwd("~/Downloads")
circ_data <- read.csv("RWE UTUC_Clinical Data 102025.csv")
circ_data <- circ_data[circ_data$Final.cohort=="TRUE",]
circ_data <- circ_data[circ_data$ctDNA.Surveillance!="",]
circ_data$ctDNA.Surveillance <- factor(circ_data$ctDNA.Surveillance, levels=c("NEGATIVE","POSITIVE"))
circ_data$Stage <- factor(circ_data$Stage, levels=c("0/I","II","III"))
positive_counts_by_stage <- aggregate(circ_data$ctDNA.Surveillance == "POSITIVE", by=list(circ_data$Stage), FUN=sum)
total_counts_by_stage <- aggregate(circ_data$ctDNA.Surveillance, by=list(circ_data$Stage), FUN=length)
combined_data <- data.frame(
Stage = total_counts_by_stage$Group.1,
Total_Count = total_counts_by_stage$x,
Positive_Count = positive_counts_by_stage$x,
Rate = (positive_counts_by_stage$x / total_counts_by_stage$x) * 100 # Convert to percentage
)
combined_data$Rate <- sprintf("%.2f%%", combined_data$Rate)
overall_total_count <- nrow(circ_data)
overall_positive_count <- nrow(circ_data[circ_data$ctDNA.Surveillance == "POSITIVE",])
overall_positivity_rate <- (overall_positive_count / overall_total_count) * 100 # Convert to percentage
overall_row <- data.frame(
Stage = "Overall",
Total_Count = overall_total_count,
Positive_Count = overall_positive_count,
Rate = sprintf("%.2f%%", overall_positivity_rate)
)
combined_data <- rbind(combined_data, overall_row)
print(combined_data)
Stage Total_Count Positive_Count Rate
1 0/I 9 3 33.33%
2 II 9 7 77.78%
3 III 22 12 54.55%
4 Overall 40 22 55.00%
#ctDNA at anytime by Stage
rm(list=ls())
setwd("~/Downloads")
circ_data <- read.csv("RWE UTUC_Clinical Data 102025.csv")
circ_data <- circ_data[circ_data$Final.cohort=="TRUE",]
circ_data <- circ_data[circ_data$ctDNA.anytime!="",]
circ_data$ctDNA.anytime <- factor(circ_data$ctDNA.anytime, levels=c("NEGATIVE","POSITIVE"))
circ_data$Stage <- factor(circ_data$Stage, levels=c("0/I","II","III"))
positive_counts_by_stage <- aggregate(circ_data$ctDNA.anytime == "POSITIVE", by=list(circ_data$Stage), FUN=sum)
total_counts_by_stage <- aggregate(circ_data$ctDNA.anytime, by=list(circ_data$Stage), FUN=length)
combined_data <- data.frame(
Stage = total_counts_by_stage$Group.1,
Total_Count = total_counts_by_stage$x,
Positive_Count = positive_counts_by_stage$x,
Rate = (positive_counts_by_stage$x / total_counts_by_stage$x) * 100 # Convert to percentage
)
combined_data$Rate <- sprintf("%.2f%%", combined_data$Rate)
overall_total_count <- nrow(circ_data)
overall_positive_count <- nrow(circ_data[circ_data$ctDNA.anytime == "POSITIVE",])
overall_positivity_rate <- (overall_positive_count / overall_total_count) * 100 # Convert to percentage
overall_row <- data.frame(
Stage = "Overall",
Total_Count = overall_total_count,
Positive_Count = overall_positive_count,
Rate = sprintf("%.2f%%", overall_positivity_rate)
)
combined_data <- rbind(combined_data, overall_row)
print(combined_data)
Stage Total_Count Positive_Count Rate
1 0/I 9 3 33.33%
2 II 10 8 80.00%
3 III 28 22 78.57%
4 Overall 47 33 70.21%
#Demographics Table
rm(list=ls())
setwd("~/Downloads")
circ_data <- read.csv("RWE UTUC_Clinical Data 102025.csv")
circ_data <- circ_data[circ_data$Final.cohort=="TRUE",]
circ_data_subset <- circ_data %>%
select(
Age,
Sex,
Stage,
NAC,
ACT,
ACT.type,
RFS.Event,
OS.Event,
OS.months) %>%
mutate(
Age = as.numeric(Age),
Sex = factor(Sex),
Stage = factor(Stage),
NAC = factor(NAC),
ACT = factor(ACT),
ACT.type = factor(ACT.type),
RFS.Event = factor(RFS.Event, levels = c("FALSE", "TRUE"), labels = c("No Recurrence", "Recurrence")),
OS.Event = factor(OS.Event, levels = c("FALSE", "TRUE"), labels = c("Alive", "Deceased")),
OS.months = as.numeric(OS.months))
table1 <- circ_data_subset %>%
tbl_summary(
statistic = list(
all_continuous() ~ "{median} ({min} - {max})",
all_categorical() ~ "{n} ({p}%)")) %>%
bold_labels()
table1
| Characteristic |
N = 47 |
| Age |
77 (47 - 92) |
| Sex |
|
| Female |
17 (36%) |
| Male |
30 (64%) |
| Stage |
|
| 0/I |
9 (19%) |
| II |
10 (21%) |
| III |
28 (60%) |
| NAC |
|
| FALSE |
40 (85%) |
| TRUE |
7 (15%) |
| ACT |
|
| FALSE |
13 (28%) |
| TRUE |
34 (72%) |
| ACT.type |
|
| |
13 (28%) |
| Chemotherapy |
18 (38%) |
| Immunotherapy |
16 (34%) |
| RFS.Event |
|
| No Recurrence |
23 (49%) |
| Recurrence |
24 (51%) |
| OS.Event |
|
| Alive |
41 (87%) |
| Deceased |
6 (13%) |
| OS.months |
21 (3 - 48) |
fit1 <- as_flex_table(
table1,
include = everything(),
return_calls = FALSE)
fit1
Characteristic | N = 471 |
|---|
Age | 77 (47 - 92) |
Sex |
|
Female | 17 (36%) |
Male | 30 (64%) |
Stage |
|
0/I | 9 (19%) |
II | 10 (21%) |
III | 28 (60%) |
NAC |
|
FALSE | 40 (85%) |
TRUE | 7 (15%) |
ACT |
|
FALSE | 13 (28%) |
TRUE | 34 (72%) |
ACT.type |
|
| 13 (28%) |
Chemotherapy | 18 (38%) |
Immunotherapy | 16 (34%) |
RFS.Event |
|
No Recurrence | 23 (49%) |
Recurrence | 24 (51%) |
OS.Event |
|
Alive | 41 (87%) |
Deceased | 6 (13%) |
OS.months | 21 (3 - 48) |
1Median (Min - Max); n (%) |
save_as_docx(fit1, path= "~/Downloads/1. Demographics Table_RWE UTUC.docx")
#Overview plot
rm(list=ls())
setwd("~/Downloads")
clinstage<- read.csv("RWE UTUC_OP 102025.csv")
clinstage <- clinstage[clinstage$Final.cohort=="TRUE",]
clinstage_df<- as.data.frame(clinstage)
#Display the swimmer plot with the label box
oplot<-swimmer_plot(df=clinstage_df,
id='PatientName',
end='fu.diff.months',
fill='gray',
width=.01,)
oplot <- oplot + theme(panel.border = element_blank())
oplot <- oplot + scale_y_continuous(breaks = seq(0, 96, by = 3))
oplot <- oplot + labs(x ="Patients" , y="Time from Surgery (Months)")
oplot

##plot events
oplot_ev1 <- oplot + swimmer_points(df_points=clinstage_df,
id='PatientName',
time='date.diff.months',
name_shape ='Event_type',
name_col = 'Event',
size=3.5,fill='black',
#col='darkgreen'
)
oplot_ev1

#Shape customization to Event_type
oplot_ev1.1 <- oplot_ev1 + ggplot2::scale_shape_manual(name="Event_type",values=c(1,16,6,18,4),breaks=c('ctDNA_neg','ctDNA_pos','Imaging','Surgery','Death'))
oplot_ev1.1

#plot treatment
oplot_ev2 <- oplot_ev1.1 + swimmer_lines(df_lines=clinstage_df,
id='PatientName',
start='Tx_start.months',
end='Tx_end.months',
name_col='Tx_type',
size=3.5,
name_alpha = 1.0)
oplot_ev2 <- oplot_ev2 + guides(linetype = guide_legend(override.aes = list(size = 5, color = "black")))
oplot_ev2

#colour customization
oplot_ev2.2 <- oplot_ev2 + ggplot2::scale_color_manual(name="Event",values=c( "lightblue","purple","green", "black", "black", "black", "lightblue","lightblue","green","red","green","green","blue","lightblue", "blue", "blue"))
oplot_ev2.2

#RFS by ctDNA status at MRD Window
rm(list=ls())
setwd("~/Downloads")
circ_data <- read.csv("RWE UTUC_Clinical Data 102025.csv")
circ_data <- circ_data[circ_data$Final.cohort=="TRUE",]
circ_data <- circ_data[circ_data$ctDNA.MRD!="",]
circ_data$RFS.MRD <- circ_data$RFS - circ_data$ctDNA.timing
circ_data$RFS.MRD.months <- circ_data$RFS.MRD / 30.437
survfit(Surv(time = circ_data$RFS.MRD.months, event = circ_data$RFS.Event)~ctDNA.MRD, data = circ_data)
Call: survfit(formula = Surv(time = circ_data$RFS.MRD.months, event = circ_data$RFS.Event) ~
ctDNA.MRD, data = circ_data)
n events median 0.95LCL 0.95UCL
ctDNA.MRD=NEGATIVE 13 3 NA 12.62 NA
ctDNA.MRD=POSITIVE 15 11 9.07 6.77 NA
event_summary <- circ_data %>%
group_by(ctDNA.MRD) %>%
summarise(
Total = n(),
Events = sum(RFS.Event),
Fraction = Events / n(),
Percentage = (Events / n()) * 100
)
print(event_summary)
# A tibble: 2 × 5
ctDNA.MRD Total Events Fraction Percentage
<chr> <int> <int> <dbl> <dbl>
1 NEGATIVE 13 3 0.231 23.1
2 POSITIVE 15 11 0.733 73.3
surv_object <-Surv(time = circ_data$RFS.MRD.months, event = circ_data$RFS.Event)
KM_curve <- survfit(surv_object ~ ctDNA.MRD, data = circ_data,conf.int=0.95,conf.type="log-log")
ggsurvplot(KM_curve, data = circ_data, pval = FALSE, conf.int = FALSE, risk.table = TRUE, break.time.by=3, palette=c("blue","red"), title="RFS - ctDNA at the MRD Window", ylab= "Recurrence-Free Survival", xlab="Time from Landmark Time point (months)", legend.labs=c("ctDNA Negative", "ctDNA Positive"), legend.title="")


summary(KM_curve, times= c(0, 12, 18))
Call: survfit(formula = surv_object ~ ctDNA.MRD, data = circ_data,
conf.int = 0.95, conf.type = "log-log")
ctDNA.MRD=NEGATIVE
time n.risk n.event survival std.err lower 95% CI upper 95% CI
0 13 0 1.000 0.000 1.000 1.000
12 8 2 0.818 0.116 0.447 0.951
18 5 1 0.716 0.140 0.350 0.899
ctDNA.MRD=POSITIVE
time n.risk n.event survival std.err lower 95% CI upper 95% CI
0 15 0 1.000 0.000 1.000 1.000
12 6 8 0.467 0.129 0.212 0.687
18 3 2 0.311 0.124 0.102 0.550
circ_data$ctDNA.MRD <- factor(circ_data$ctDNA.MRD, levels=c("NEGATIVE","POSITIVE"))
cox_fit <- coxph(surv_object ~ ctDNA.MRD, data=circ_data)
ggforest(cox_fit,data = circ_data)

summary(cox_fit)
Call:
coxph(formula = surv_object ~ ctDNA.MRD, data = circ_data)
n= 28, number of events= 14
coef exp(coef) se(coef) z Pr(>|z|)
ctDNA.MRDPOSITIVE 1.4752 4.3720 0.6539 2.256 0.0241 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
exp(coef) exp(-coef) lower .95 upper .95
ctDNA.MRDPOSITIVE 4.372 0.2287 1.214 15.75
Concordance= 0.677 (se = 0.057 )
Likelihood ratio test= 6.31 on 1 df, p=0.01
Wald test = 5.09 on 1 df, p=0.02
Score (logrank) test = 6.05 on 1 df, p=0.01
cox_fit_summary <- summary(cox_fit)
#Extract values for HR, 95% CI, and p-value
HR <- cox_fit_summary$coefficients[2]
lower_CI <- cox_fit_summary$conf.int[3]
upper_CI <- cox_fit_summary$conf.int[4]
p_value <- cox_fit_summary$coefficients[5]
label_text <- paste0("HR = ", round(HR, 2), " (", round(lower_CI, 2), "-", round(upper_CI, 2), "); p = ", round(p_value, 3))
print(label_text)
[1] "HR = 4.37 (1.21-15.75); p = 0.024"
circ_data$ctDNA.MRD <- factor(circ_data$ctDNA.MRD, levels = c("NEGATIVE", "POSITIVE"), labels = c("Negative", "Positive"))
circ_data$RFS.Event <- factor(circ_data$RFS.Event, levels = c("FALSE", "TRUE"), labels = c("No Recurrence", "Recurrence"))
contingency_table <- table(circ_data$ctDNA.MRD, circ_data$RFS.Event)
chi_square_test <- chisq.test(contingency_table)
print(chi_square_test)
Pearson's Chi-squared test with Yates' continuity correction
data: contingency_table
X-squared = 5.1692, df = 1, p-value = 0.02299
fisher_exact_test <- fisher.test(contingency_table)
print(fisher_exact_test)
Fisher's Exact Test for Count Data
data: contingency_table
p-value = 0.0213
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
1.28780 74.44434
sample estimates:
odds ratio
8.328277
print(contingency_table)
No Recurrence Recurrence
Negative 10 3
Positive 4 11
table_df <- as.data.frame(contingency_table)
table_df$Total <- ave(table_df$Freq, table_df$Var1, FUN = sum)
table_df$Percentage <- table_df$Freq / table_df$Total
table_df$MiddlePercentage <- table_df$Percentage / 2
ggplot(table_df, aes(x = Var1, y = Percentage, fill = Var2)) +
geom_bar(stat = "identity") +
geom_text(aes(y = MiddlePercentage, label = Freq), position = "stack", color = "black", vjust = 1.5, size = 7) +
theme_minimal() +
labs(title = "ctDNA status at MRD",
x = "ctDNA",
y = "Patients (%)",
fill = "Recurrence",
caption = paste("Fisher's exact test p-value: ", format.pval(fisher_exact_test$p.value))) +
scale_y_continuous(labels = scales::percent_format()) +
scale_fill_manual(values = c("No Recurrence" = "blue", "Recurrence" = "red")) + # define custom colors
theme(axis.text.x = element_text(angle = 0, hjust = 1.5, size = 14), # increase x-axis text size
axis.text.y = element_text(size = 14, color = "black"), # increase y-axis text size
axis.title.x = element_text(size = 14, color = "black"), # increase x-axis label size
axis.title.y = element_text(size = 14, color = "black"), # increase y-axis label size
legend.text = element_text(size = 12, color = "black")) # increase Recurrence label size

#RFS by ctDNA status at Surveillance Window
rm(list=ls())
setwd("~/Downloads")
circ_data <- read.csv("RWE UTUC_Clinical Data 102025.csv")
circ_data <- circ_data[circ_data$Final.cohort=="TRUE",]
circ_data <- circ_data[circ_data$ctDNA.Surveillance!="",]
survfit(Surv(time = circ_data$RFS.months, event = circ_data$RFS.Event)~ctDNA.Surveillance, data = circ_data)
Call: survfit(formula = Surv(time = circ_data$RFS.months, event = circ_data$RFS.Event) ~
ctDNA.Surveillance, data = circ_data)
n events median 0.95LCL 0.95UCL
ctDNA.Surveillance=NEGATIVE 18 3 NA NA NA
ctDNA.Surveillance=POSITIVE 22 17 13.9 10.1 NA
event_summary <- circ_data %>%
group_by(ctDNA.Surveillance) %>%
summarise(
Total = n(),
Events = sum(RFS.Event),
Fraction = Events / n(),
Percentage = (Events / n()) * 100
)
print(event_summary)
# A tibble: 2 × 5
ctDNA.Surveillance Total Events Fraction Percentage
<chr> <int> <int> <dbl> <dbl>
1 NEGATIVE 18 3 0.167 16.7
2 POSITIVE 22 17 0.773 77.3
surv_object <-Surv(time = circ_data$RFS.months, event = circ_data$RFS.Event)
KM_curve <- survfit(surv_object ~ ctDNA.Surveillance, data = circ_data,conf.int=0.95,conf.type="log-log")
ggsurvplot(KM_curve, data = circ_data, pval = FALSE, conf.int = FALSE, risk.table = TRUE, break.time.by=6, palette=c("blue","red"), title="RFS - ctDNA at the Surveillance Window", ylab= "Recurrence-Free Survival", xlab="Time from Surgery (months)", legend.labs=c("ctDNA Negative", "ctDNA Positive"), legend.title="")


summary(KM_curve, times= c(0, 12, 24))
Call: survfit(formula = surv_object ~ ctDNA.Surveillance, data = circ_data,
conf.int = 0.95, conf.type = "log-log")
ctDNA.Surveillance=NEGATIVE
time n.risk n.event survival std.err lower 95% CI upper 95% CI
0 18 0 1.000 0.0000 1.000 1.000
12 15 1 0.938 0.0605 0.632 0.991
24 5 2 0.757 0.1277 0.401 0.919
ctDNA.Surveillance=POSITIVE
time n.risk n.event survival std.err lower 95% CI upper 95% CI
0 22 0 1.000 0.000 1.000 1.000
12 12 9 0.591 0.105 0.361 0.762
24 4 6 0.287 0.101 0.114 0.488
circ_data$ctDNA.Surveillance <- factor(circ_data$ctDNA.Surveillance, levels=c("NEGATIVE","POSITIVE"))
cox_fit <- coxph(surv_object ~ ctDNA.Surveillance, data=circ_data)
ggforest(cox_fit,data = circ_data)

summary(cox_fit)
Call:
coxph(formula = surv_object ~ ctDNA.Surveillance, data = circ_data)
n= 40, number of events= 20
coef exp(coef) se(coef) z Pr(>|z|)
ctDNA.SurveillancePOSITIVE 1.8586 6.4151 0.6287 2.956 0.00311 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
exp(coef) exp(-coef) lower .95 upper .95
ctDNA.SurveillancePOSITIVE 6.415 0.1559 1.871 21.99
Concordance= 0.699 (se = 0.047 )
Likelihood ratio test= 12.46 on 1 df, p=4e-04
Wald test = 8.74 on 1 df, p=0.003
Score (logrank) test = 11.47 on 1 df, p=7e-04
cox_fit_summary <- summary(cox_fit)
#Extract values for HR, 95% CI, and p-value
HR <- cox_fit_summary$coefficients[2]
lower_CI <- cox_fit_summary$conf.int[3]
upper_CI <- cox_fit_summary$conf.int[4]
p_value <- cox_fit_summary$coefficients[5]
label_text <- paste0("HR = ", round(HR, 2), " (", round(lower_CI, 2), "-", round(upper_CI, 2), "); p = ", round(p_value, 3))
print(label_text)
[1] "HR = 6.42 (1.87-21.99); p = 0.003"
circ_data$ctDNA.Surveillance <- factor(circ_data$ctDNA.Surveillance, levels = c("NEGATIVE", "POSITIVE"), labels = c("Negative", "Positive"))
circ_data$RFS.Event <- factor(circ_data$RFS.Event, levels = c("FALSE", "TRUE"), labels = c("No Recurrence", "Recurrence"))
contingency_table <- table(circ_data$ctDNA.Surveillance, circ_data$RFS.Event)
chi_square_test <- chisq.test(contingency_table)
print(chi_square_test)
Pearson's Chi-squared test with Yates' continuity correction
data: contingency_table
X-squared = 12.222, df = 1, p-value = 0.0004722
fisher_exact_test <- fisher.test(contingency_table)
print(fisher_exact_test)
Fisher's Exact Test for Count Data
data: contingency_table
p-value = 0.0003284
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
2.867735 118.989345
sample estimates:
odds ratio
15.45796
print(contingency_table)
No Recurrence Recurrence
Negative 15 3
Positive 5 17
table_df <- as.data.frame(contingency_table)
table_df$Total <- ave(table_df$Freq, table_df$Var1, FUN = sum)
table_df$Percentage <- table_df$Freq / table_df$Total
table_df$MiddlePercentage <- table_df$Percentage / 2
ggplot(table_df, aes(x = Var1, y = Percentage, fill = Var2)) +
geom_bar(stat = "identity") +
geom_text(aes(y = MiddlePercentage, label = Freq), position = "stack", color = "black", vjust = 1.5, size = 7) +
theme_minimal() +
labs(title = "ctDNA status at Surveillance",
x = "ctDNA",
y = "Patients (%)",
fill = "Recurrence",
caption = paste("Fisher's exact test p-value: ", format.pval(fisher_exact_test$p.value))) +
scale_y_continuous(labels = scales::percent_format()) +
scale_fill_manual(values = c("No Recurrence" = "blue", "Recurrence" = "red")) + # define custom colors
theme(axis.text.x = element_text(angle = 0, hjust = 1.5, size = 14), # increase x-axis text size
axis.text.y = element_text(size = 14, color = "black"), # increase y-axis text size
axis.title.x = element_text(size = 14, color = "black"), # increase x-axis label size
axis.title.y = element_text(size = 14, color = "black"), # increase y-axis label size
legend.text = element_text(size = 12, color = "black")) # increase Recurrence label size

#Time-dependent analysis for RFS in longitudinal time points
rm(list=ls())
setwd("~/Downloads")
dt <- read_xlsx("CLIA UTUC Clinical Data_Time dependent.xlsx") |>
clean_names() |>
mutate(across(
.cols = c(window_start_date, dfs_date, surveillance_1_date:surveillance_12_date),
.fns = ~ as_date(as.Date(.x, format = "%Y-%m-%d"))
)) |>
filter(final_cohort == TRUE)
dt_biomarker <- dt |>
select(pts_id, ct_dna_surveillance_available,
window_start_date,
surveillance_1_status:surveillance_12_date) |>
filter(ct_dna_surveillance_available) |>
pivot_longer(cols = surveillance_1_status:surveillance_12_date,
names_to = c("visit_number", ".value"),
names_pattern = "surveillance_(.)_(.*)") |>
mutate(biomarker_time = day(days(date - window_start_date))) |>
select(pts_id, biomarker_time, biomarker_status = status) |>
filter(!is.na(biomarker_time))
glimpse(dt_biomarker)
Rows: 137
Columns: 3
$ pts_id <chr> "UTUC-002", "UTUC-010", "UTUC-010", "UTUC-010", "UTUC-010", "UTUC-010", "UTUC-010", "UTUC-010", "UTUC-010", "UTUC-010", "UTUC-011", "UTUC-011", "UTUC-…
$ biomarker_time <dbl> 133, 144, 181, 223, 267, 307, 349, 392, 433, 475, 74, 164, 204, 257, 385, 107, 145, 190, 267, 411, 78, 133, 267, 274, 341, 433, 42, 75, 21, 64, 111, 1…
$ biomarker_status <chr> "POSITIVE", "NEGATIVE", "NEGATIVE", "NEGATIVE", "NEGATIVE", "NEGATIVE", "NEGATIVE", "POSITIVE", "POSITIVE", "POSITIVE", "NEGATIVE", "POSITIVE", "POSIT…
dt_survival <- dt |>
select(pts_id, ct_dna_surveillance_available,
window_start_date:dfs_date, dfs_event) |> # Added dfs_event here
filter(ct_dna_surveillance_available) |>
mutate(dfs_time = (dfs_date - window_start_date),
dfs_time = day(days(dfs_time)),
dfs_event = as.numeric(dfs_event)) |>
select(pts_id, dfs_time, dfs_event)
glimpse(dt_survival)
Rows: 38
Columns: 3
$ pts_id <chr> "UTUC-002", "UTUC-010", "UTUC-011", "UTUC-012", "UTUC-015", "UTUC-026", "UTUC-027", "UTUC-028", "UTUC-030", "UTUC-031", "UTUC-032", "UTUC-034", "UTUC-035", "…
$ dfs_time <dbl> 143, 707, 651, 636, 671, 90, 260, 13, 424, 74, 614, 697, 120, 314, 594, 210, 627, 23, 72, 1080, 773, 482, 201, 347, 893, 426, 324, 662, 182, 557, 77, 219, 61…
$ dfs_event <dbl> 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0
aux <- dt_survival %>%
filter(dfs_time <= 0)
tab <- left_join(aux, dt) |>
select(pts_id, window_start_date, dfs_time, dfs_date,
surveillance_1_date:surveillance_12_date) |>
mutate(across(.cols = dfs_date:surveillance_12_date,
.fns = ~ as_date(.x))) |>
select(pts_id, window_start_date, dfs_date, dfs_time)
datatable(tab, filter = "top")
dt_survival <- dt_survival |>
filter(dfs_time > 0)
aux <- dt |>
select(pts_id, ct_dna_surveillance_available,
window_start_date, dfs_date,
surveillance_1_date:surveillance_12_date) |>
mutate(across(.cols = surveillance_1_date:surveillance_12_date,
.fns = ~ .x - window_start_date)) |>
mutate(across(.cols = surveillance_1_date:surveillance_12_date,
.fns = ~ .x < 0)) |>
rowwise() |>
mutate(sum_neg =
sum(c_across(surveillance_1_date:surveillance_12_date),
na.rm = TRUE)) |>
select(pts_id, sum_neg)
tab <- left_join(aux, dt) |>
filter(sum_neg > 0) |>
select(pts_id, sum_neg, window_start_date,
surveillance_1_date:surveillance_12_date) |>
mutate(across(.cols = window_start_date:surveillance_12_date,
.fns = ~ as_date(.x)))
datatable(tab, filter = "top")
aux <- dt |>
select(pts_id, ct_dna_surveillance_available,
window_start_date, dfs_date,
surveillance_1_date:surveillance_12_date) |>
mutate(across(.cols = dfs_date:surveillance_12_date,
.fns = ~ .x - window_start_date)) |>
mutate(across(.cols = surveillance_2_date:surveillance_12_date,
.fns = ~ dfs_date < .x)) |>
rowwise() |>
mutate(n_biomarker_after_event = sum(c_across(surveillance_2_date:
surveillance_12_date),
na.rm = TRUE)) |>
mutate(across(.cols = surveillance_1_date:surveillance_12_date,
.fns = ~ !is.na(.x))) |>
mutate(total_biomarker = sum(c_across(surveillance_2_date:
surveillance_12_date),
na.rm = TRUE)) |>
select(pts_id, n_biomarker_after_event, total_biomarker)
temp <- aux |>
select(-pts_id) |>
group_by(n_biomarker_after_event, total_biomarker) |> # Direct grouping
summarise(freq = n(), .groups = "drop") # Drop groups after summarization
tab <- left_join(aux, dt) |>
select(pts_id, n_biomarker_after_event, total_biomarker,
dfs_date,
surveillance_2_date:surveillance_12_date) |>
mutate(across(.cols = dfs_date:surveillance_12_date,
.fns = ~ as_date(.x))) |>
filter(n_biomarker_after_event > 0)
datatable(tab, filter = "top")
aux <- tmerge(data1 = dt_survival,
data2 = dt_survival,
id = pts_id,
dfs_event = event(dfs_time, dfs_event))
dt_final <- tmerge(data1 = aux,
data2 = dt_biomarker,
id = pts_id,
biomarker_status =
tdc(biomarker_time, biomarker_status))
datatable(dt_final, filter = "top")
# Syntax if there is not time-dependent covariate
# fit <- coxph(Surv(dfs_time, dfs_event) ~ biomarker_status,
# data = dt_final)
# summary(fit)
fit <- coxph(Surv(tstart, tstop, dfs_event) ~ biomarker_status,
data = dt_final)
summary(fit)
Call:
coxph(formula = Surv(tstart, tstop, dfs_event) ~ biomarker_status,
data = dt_final)
n= 127, number of events= 17
(36 observations deleted due to missingness)
coef exp(coef) se(coef) z Pr(>|z|)
biomarker_statusPOSITIVE 2.8209 16.7919 0.6524 4.324 1.53e-05 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
exp(coef) exp(-coef) lower .95 upper .95
biomarker_statusPOSITIVE 16.79 0.05955 4.675 60.32
Concordance= 0.824 (se = 0.055 )
Likelihood ratio test= 26.23 on 1 df, p=3e-07
Wald test = 18.7 on 1 df, p=2e-05
Score (logrank) test = 31 on 1 df, p=3e-08
cox_fit_summary <- summary(fit)
#Extract values for HR, 95% CI, and p-value
HR <- cox_fit_summary$coefficients[2]
lower_CI <- cox_fit_summary$conf.int[3]
upper_CI <- cox_fit_summary$conf.int[4]
p_value <- cox_fit_summary$coefficients[5]
label_text <- paste0("HR = ", round(HR, 2), " (", round(lower_CI, 2), "-", round(upper_CI, 2), "); p = ", round(p_value, 3))
print(label_text)
[1] "HR = 16.79 (4.67-60.32); p = 0"
#Median numbers of time points in the longitudinal setting
# Load the dataset
rm(list=ls())
setwd("~/Downloads")
circ_data <- read.csv("RWE UTUC_Clinical Data 102025.csv")
circ_data <- circ_data[circ_data$Final.cohort=="TRUE",]
circ_data <- circ_data[circ_data$ctDNA.Surveillance!="",]
circ_datadf <- as.data.frame(circ_data)
median_Nsurvtps <- median(circ_datadf$Nsurvtps, na.rm = TRUE)
min_Nsurvtps <- min(circ_datadf$Nsurvtps, na.rm = TRUE)
max_Nsurvtps <- max(circ_datadf$Nsurvtps, na.rm = TRUE)
cat(sprintf("Median # of surveillance time points: %d (%d-%d)\n",
median_Nsurvtps, min_Nsurvtps, max_Nsurvtps))
Median # of surveillance time points: 3 (1-12)
#Median of median interval of ctDNA timepoints and radiological
imaging assessment
rm(list=ls())
setwd("~/Downloads")
df <- read.csv("RWE UTUC_OP 102025.csv", stringsAsFactors = FALSE)
names(df) <- trimws(names(df))
# ---- Helper: compute median interval per patient ----
median_interval_per_patient <- function(data, filter_col, filter_val) {
data %>%
filter(!!sym(filter_col) == filter_val) %>%
arrange(PatientName, date.diff) %>%
group_by(PatientName) %>%
mutate(interval = date.diff - lag(date.diff)) %>%
summarise(median_interval = median(interval, na.rm = TRUE), .groups = "drop") %>%
mutate(event_group = filter_val)
}
# ---- Compute per-patient medians ----
ctDNA_patient_medians <- median_interval_per_patient(df, "Event", "ctDNA")
imaging_patient_medians <- median_interval_per_patient(df, "Event_type", "Imaging")
# ---- Function to summarize patient-level medians to cohort-level ----
cohort_summary <- function(patient_data, event_label) {
# Filter out patients with NA median intervals
valid_data <- patient_data %>% filter(!is.na(median_interval))
data.frame(
Event_Type = event_label,
Cohort_Median_Frequency = median(valid_data$median_interval, na.rm = TRUE),
Min_Patient_Median = min(valid_data$median_interval, na.rm = TRUE),
Max_Patient_Median = max(valid_data$median_interval, na.rm = TRUE),
Range_Patient_Median = max(valid_data$median_interval, na.rm = TRUE) -
min(valid_data$median_interval, na.rm = TRUE)
)
}
# ---- Combine all results ----
results <- bind_rows(
cohort_summary(ctDNA_patient_medians, "ctDNA"),
cohort_summary(imaging_patient_medians, "Imaging")
)
# ---- Print cohort-level summary ----
cat("===== Median Frequency (Days) per Cohort =====\n")
===== Median Frequency (Days) per Cohort =====
print(results, row.names = FALSE)
Event_Type Cohort_Median_Frequency Min_Patient_Median Max_Patient_Median Range_Patient_Median
ctDNA 75.75 14 156 142
Imaging 154.25 0 932 932
# ---- OPTIONAL: Save patient-level medians ----
patient_level_medians <- bind_rows(ctDNA_patient_medians, imaging_patient_medians)
#write.csv(patient_level_medians, "patient_median_intervals.csv", row.names = FALSE)
#Plot for individual lead-time calculations for each pt
rm(list=ls())
csv_path <- "~/Downloads/CLIA UTUC_Lead Time pts.csv"
df <- read.csv(csv_path, check.names = FALSE)
if ("Final.cohort" %in% names(df)) {
df <- df[df$Final.cohort == TRUE, ]
} else {
warning("Column 'Final.cohort' not found in the dataset.")
}
id_candidates <- c("Patient","ID","Pt","Subject","Sample")
id_col <- id_candidates[id_candidates %in% names(df)][1]
if (is.na(id_col)) {
df <- df %>% mutate(Patient = row_number())
id_col <- "Patient"
}
num_cols <- intersect(c("MR","CR","LT"), names(df))
df[num_cols] <- lapply(df[num_cols], function(x) suppressWarnings(as.numeric(x)))
# If LT missing, compute in days
if (!("LT" %in% names(df))) {
df <- df %>% mutate(LT = CR - MR)
}
# ---- convert to months ----
# Approximate: 30.44 days per month (average)
days_to_months <- function(x) x / 30.437
df <- df %>%
mutate(MR_mo = days_to_months(MR),
CR_mo = days_to_months(CR),
LT_mo = days_to_months(LT))
# ---- ordering for y-axis ----
df <- df %>%
mutate(Earliest = pmin(MR_mo, CR_mo, na.rm = TRUE)) %>%
arrange(Earliest) %>%
mutate(Patient_f = factor(.data[[id_col]], levels = .data[[id_col]]))
# ---- long format for points ----
points_long <- df %>%
select(Patient_f, MR_mo, CR_mo) %>%
pivot_longer(c(MR_mo, CR_mo), names_to = "Type", values_to = "Months") %>%
mutate(Type = dplyr::recode(Type,
"MR_mo" = "Molecular recurrence",
"CR_mo" = "Clinical recurrence"))
# ---- segments between MR and CR ----
segments_df <- df %>%
transmute(Patient_f,
x0 = MR_mo, x1 = CR_mo,
lt_flag = if_else(LT > 120, "gt120", "le120"))
# ---- annotation ----
med_lt <- median(df$LT_mo, na.rm = TRUE)
min_lt <- min(df$LT_mo, na.rm = TRUE)
max_lt <- max(df$LT_mo, na.rm = TRUE)
annot_label <- sprintf("Median lead-time:\n%.1f months (%.1f to %.1f)",
med_lt, min_lt, max_lt)
# ---- plot ----
pal <- c("Molecular recurrence" = "#10B4C1",
"Clinical recurrence" = "#C96A72")
x_max <- max(c(df$MR_mo, df$CR_mo), na.rm = TRUE)
y_mid <- levels(df$Patient_f)[ceiling(nlevels(df$Patient_f) * 0.55)]
p <- ggplot() +
geom_segment(data = segments_df,
aes(x = x0, xend = x1, y = Patient_f, yend = Patient_f,
linetype = lt_flag),
linewidth = 0.6, color = "black") +
scale_linetype_manual(values = c(le120 = "solid", gt120 = "dashed"),
guide = "none") +
geom_point(data = points_long,
aes(x = Months, y = Patient_f, color = Type),
size = 2.8) +
scale_color_manual(values = pal, name = NULL) +
labs(x = "Months from Surgery or Definitive Treatment", y = NULL) +
annotate("text",
x = x_max * 0.73,
y = y_mid,
label = annot_label,
hjust = 0, vjust = 0.5, size = 3.8) +
scale_x_continuous(breaks = seq(0, ceiling(x_max/3)*3, by = 3)) +
coord_cartesian(xlim = c(0, x_max * 1.05)) +
theme_bw(base_size = 12) +
theme(
panel.grid.major = element_line(color = "grey85", linewidth = 0.3),
panel.grid.minor = element_line(color = "grey92", linewidth = 0.2),
axis.text.y = element_text(size = 9)
)
print(p)

wc_df <- df %>% dplyr::select(MR, CR) %>% tidyr::drop_na()
paired_diff <- wc_df$CR - wc_df$MR
wilx <- wilcox.test(
x = wc_df$CR, y = wc_df$MR,
paired = TRUE,
alternative = "two.sided",
conf.int = TRUE, # gives CI for the Hodges–Lehmann estimate of the median difference
exact = FALSE # safer for ties/large N
)
W_stat <- unname(wilx$statistic) # Wilcoxon V
p_value <- wilx$p.value
HL_est <- unname(wilx$estimate) # median of (CR - MR)
HL_ci <- wilx$conf.int # CI for median difference
# Simple p-value formatter for annotation/publication
fmt_p <- function(p) {
if (is.na(p)) return("NA")
if (p < 0.001) "< 0.001" else sprintf("= %.3f", round(p, 3))
}
p_text <- paste0("P ", fmt_p(p_value)) # e.g., "P = 0.003" or "P < 0.001"
# Optional: print a compact summary
cat("\nPaired Wilcoxon signed-rank test (CR vs MR)\n",
"-------------------------------------------\n",
sprintf("N pairs: %d", nrow(wc_df)), "\n",
sprintf("W (V): %g", W_stat), "\n",
sprintf("P-value: %s", ifelse(p_value < 0.001, "< 0.001", sprintf("%.6f", p_value))), "\n",
sprintf("Hodges–Lehmann median difference (CR - MR): %.1f days", HL_est), "\n",
sprintf("95%% CI: [%.1f, %.1f] days", HL_ci[1], HL_ci[2]), "\n", sep = "")
Paired Wilcoxon signed-rank test (CR vs MR)
-------------------------------------------
N pairs: 21
W (V): 231
P-value: < 0.001
Hodges–Lehmann median difference (CR - MR): 155.5 days
95% CI: [86.0, 241.5] days
#Multivariate cox regression at Surveillance Window for RFS
rm(list=ls())
setwd("~/Downloads")
circ_data <- read.csv("RWE UTUC_Clinical Data 102025.csv")
circ_data <- circ_data[circ_data$Final.cohort=="TRUE",]
circ_data <- circ_data[circ_data$ctDNA.Surveillance!="",]
circ_data$ctDNA.Surveillance <- factor(circ_data$ctDNA.Surveillance, levels=c("NEGATIVE","POSITIVE"), labels = c("Negative", "Positive"))
circ_data$cStage <- factor(circ_data$cStage, levels = c("0/I", "II/III"))
circ_data$NAC <- factor(circ_data$NAC, levels = c("FALSE", "TRUE"))
circ_data$ACT <- factor(circ_data$ACT, levels = c("FALSE", "TRUE"))
surv_object <- Surv(time = circ_data$RFS.months, event = circ_data$RFS.Event)
cox_fit <- coxph(surv_object ~ ctDNA.Surveillance + Age + cStage + NAC + ACT, data=circ_data)
ggforest(cox_fit, data = circ_data, main = "Multivariate Regression Model for RFS - All Stages", refLabel = "Reference Group")


test.ph <- cox.zph(cox_fit)
#ctDNA and MTM/mL Dynamics for pts at surveillance window
#Dynamics and MTM/mL plots for patients with ctDNA negative at surveillance
rm(list=ls())
setwd("~/Downloads")
df <- read.csv("CLIA UTUC ctDNA MTM.csv", stringsAsFactors = FALSE)
df <- df[df$Final.cohort=="TRUE",]
df <- df[df$ctDNA.Surveillance=="NEGATIVE",]
df$RFS.Event <- ifelse(df$RFS.Event %in% c("No", "no", "FALSE", "False", "0"), FALSE,
ifelse(df$RFS.Event %in% c("Yes", "yes", "TRUE", "True", "1"), TRUE, NA))
df$RFS.Event <- factor(df$RFS.Event, levels = c(FALSE, TRUE))
df <- df %>%
group_by(PatientName) %>%
filter(n() >= 2) %>% #keep only pts with at least 2 post-surgery time points
ungroup()
num_unique <- length(unique(df$PatientName))
cat("Number of unique patients:", num_unique, "\n")
Number of unique patients: 17
df_patient_pfs <- df %>%
group_by(PatientName) %>%
dplyr::summarize(
PFS_True = any(RFS.Event == TRUE, na.rm = TRUE),
PFS_False = all(RFS.Event == FALSE, na.rm = TRUE)
)
num_true <- sum(df_patient_pfs$PFS_True)
num_false <- sum(df_patient_pfs$PFS_False)
cat("Number of unique patients with Event:", num_true, "\n")
Number of unique patients with Event: 3
cat("Number of unique patients with No Event:", num_false, "\n")
Number of unique patients with No Event: 14
p <- ggplot(df, aes(x = date.diff.months,
y = MTM.mL,
group = PatientName,
color = RFS.Event)) +
geom_line() + # Connect timepoints for each patient
geom_point() + # Add points for each timepoint
# Use a log10 scale for the y-axis with specified breaks
scale_y_log10(breaks = c(0.01, 0.1, 1, 10, 100, 1000),
labels = c("0.01","0.1", "1", "10", "100", "1000")) +
scale_x_continuous(breaks = seq(0, max(df$date.diff.months, na.rm = TRUE), by = 6)) +
scale_color_manual(values = c("FALSE" = "blue", "TRUE" = "red")) +
labs(
x = "Time Since Surgery or start of definitive treatment (months)",
y = "Mean Tumor Molecules per mL (MTM/mL)",
color = "RFS Event"
) +
theme_minimal()
print(p)

#Dynamics and MTM/mL plots for patients with ctDNA positive at surveillance
rm(list=ls())
setwd("~/Downloads")
df <- read.csv("CLIA UTUC ctDNA MTM.csv", stringsAsFactors = FALSE)
df <- df[df$Final.cohort=="TRUE",]
df <- df[df$ctDNA.Surveillance=="POSITIVE",]
df$RFS.Event <- ifelse(df$RFS.Event %in% c("No", "no", "FALSE", "False", "0"), FALSE,
ifelse(df$RFS.Event %in% c("Yes", "yes", "TRUE", "True", "1"), TRUE, NA))
df$RFS.Event <- factor(df$RFS.Event, levels = c(FALSE, TRUE))
df <- df %>%
group_by(PatientName) %>%
filter(n() >= 2) %>% #keep only pts with at least 2 post-surgery time points
ungroup()
num_unique <- length(unique(df$PatientName))
cat("Number of unique patients:", num_unique, "\n")
Number of unique patients: 21
df_patient_pfs <- df %>%
group_by(PatientName) %>%
dplyr::summarize(
PFS_True = any(RFS.Event == TRUE, na.rm = TRUE),
PFS_False = all(RFS.Event == FALSE, na.rm = TRUE)
)
num_true <- sum(df_patient_pfs$PFS_True)
num_false <- sum(df_patient_pfs$PFS_False)
cat("Number of unique patients with Event:", num_true, "\n")
Number of unique patients with Event: 16
cat("Number of unique patients with No Event:", num_false, "\n")
Number of unique patients with No Event: 5
p <- ggplot(df, aes(x = date.diff.months,
y = MTM.mL,
group = PatientName,
color = RFS.Event)) +
geom_line() + # Connect timepoints for each patient
geom_point() + # Add points for each timepoint
# Use a log10 scale for the y-axis with specified breaks
scale_y_log10(breaks = c(0.01, 0.1, 1, 10, 100, 1000),
labels = c("0.01","0.1", "1", "10", "100", "1000")) +
scale_x_continuous(breaks = seq(0, max(df$date.diff.months, na.rm = TRUE), by = 6)) +
scale_color_manual(values = c("FALSE" = "blue", "TRUE" = "red")) +
labs(
x = "Time Since Surgery or start of definitive treatment (months)",
y = "Mean Tumor Molecules per mL (MTM/mL)",
color = "RFS Event"
) +
theme_minimal()
print(p)

#ctDNA velocity and lead time liner regression
rm(list=ls())
csv_path <- "~/Downloads/CLIA UTUC_ctDNA velocity.csv" # <- adjust if needed
df_raw <- read.csv(csv_path, check.names = FALSE)
df <- df_raw %>%
rename(MTM_mL = `MTM.mL`) %>%
mutate(
daysCR.months = as.numeric(daysCR.months),
MTM_mL = as.numeric(MTM_mL)
) %>%
filter(Final.cohort == TRUE, !is.na(PatientName), !is.na(daysCR.months), !is.na(MTM_mL))
preCR <- df %>%
filter(daysCR.months <= 0, is.finite(MTM_mL), MTM_mL > 0)
eligible <- preCR %>%
group_by(PatientName) %>%
filter(n() >= 2, n_distinct(daysCR.months) >= 2) %>%
ungroup()
if (nrow(eligible) == 0) {
warning("No patients have ≥2 valid pre-recurrence points with distinct times; regression lines will be omitted.")
}
fits <- eligible %>%
group_by(PatientName) %>%
summarise(x_min = min(daysCR.months, na.rm = TRUE), .groups = "drop") %>%
mutate(grid = map(x_min, ~seq(.x, 0, length.out = 50))) %>%
select(PatientName, grid)
predict_patient <- function(dat, newx) {
if (length(newx) == 0) {
return(tibble(daysCR.months = numeric(0), MTM_mL = numeric(0)))
}
dat2 <- dat %>%
filter(is.finite(MTM_mL), MTM_mL > 0) %>%
mutate(log_ctdna = log10(MTM_mL))
if (nrow(dat2) < 2 || n_distinct(dat2$daysCR.months) < 2 || any(!is.finite(dat2$log_ctdna))) {
return(tibble(daysCR.months = numeric(0), MTM_mL = numeric(0)))
}
m <- lm(log_ctdna ~ daysCR.months, data = dat2)
tibble(
daysCR.months = newx,
MTM_mL = 10 ^ predict(m, newdata = tibble(daysCR.months = newx))
)
}
pred_lines <- eligible %>%
group_by(PatientName) %>%
tidyr::nest() %>% # list-column "data" per patient
left_join(fits, by = "PatientName") %>% # list-column "grid" per patient
mutate(pred = map2(data, grid, ~predict_patient(.x, .y))) %>%
select(PatientName, pred) %>%
tidyr::unnest(pred)
pooled_line <- {
if (nrow(preCR) >= 2 && n_distinct(preCR$daysCR.months) >= 2) {
pooled_x <- seq(min(preCR$daysCR.months, na.rm = TRUE), 0, length.out = 100)
pooled_fit <- lm(log10(MTM_mL) ~ daysCR.months, data = preCR)
tibble(
daysCR.months = pooled_x,
MTM_mL = 10 ^ predict(pooled_fit, newdata = tibble(daysCR.months = pooled_x))
)
} else {
tibble(daysCR.months = numeric(0), MTM_mL = numeric(0))
}
}
x_min <- floor(min(df$daysCR.months, na.rm = TRUE) / 3) * 3
x_max <- ceiling(max(df$daysCR.months, na.rm = TRUE) / 3) * 3
y_min_pos <- max(min(df$MTM_mL[df$MTM_mL > 0], na.rm = TRUE) / 2, 0.01)
y_max_pos <- 10 ^ ceiling(log10(max(df$MTM_mL, na.rm = TRUE)))
log_breaks <- 10 ^ seq(floor(log10(y_min_pos)), ceiling(log10(y_max_pos)))
p <- ggplot() +
# per-patient fitted lines (if any)
geom_line(data = pred_lines,
aes(x = daysCR.months, y = MTM_mL, color = PatientName),
linewidth = 1) +
# pooled dashed trend (if any)
geom_line(data = pooled_line,
aes(x = daysCR.months, y = MTM_mL),
linewidth = 0.8, linetype = "dashed", color = "grey35") +
# raw points (all timepoints, before and after)
geom_point(data = df,
aes(x = daysCR.months, y = MTM_mL, color = PatientName),
size = 1.8, alpha = 0.85) +
# vertical line at imaging-positive date
geom_vline(xintercept = 0, linetype = "dashed", linewidth = 0.8, color = "black") +
scale_y_log10(breaks = log_breaks,
labels = label_number(accuracy = 1)) +
scale_x_continuous(breaks = seq(x_min, x_max, by = 3)) + # every 3 months
labs(
x = "Months before/after clinical recurrence (0 = imaging positive)",
y = "ctDNA level (MTM/mL)",
color = "Patient"
) +
theme_bw(base_size = 12) +
theme(
panel.grid.major = element_line(color = "grey85", linewidth = 0.3),
panel.grid.minor = element_line(color = "grey92", linewidth = 0.2),
legend.position = "none" # change to "right" if you want the legend
)
print(p)

#Time from first ctDNA positive timepoint to last imaging in
surveillance_False Positive patients
rm(list=ls())
csv_path <- "~/Downloads/CLIA UTUC_Time from ctDNA to imaging FP pts.csv"
df <- read.csv(csv_path, check.names = FALSE)
if ("Final.cohort" %in% names(df)) {
df <- df[df$Final.cohort == TRUE, ]
} else {
warning("Column 'Final.cohort' not found in the dataset.")
}
id_candidates <- c("Patient","ID","Pt","Subject","Sample")
id_col <- id_candidates[id_candidates %in% names(df)][1]
if (is.na(id_col)) {
df <- df %>% mutate(Patient = row_number())
id_col <- "Patient"
}
num_cols <- intersect(c("MR","CR","LT"), names(df))
df[num_cols] <- lapply(df[num_cols], function(x) suppressWarnings(as.numeric(x)))
# If LT missing, compute in days
if (!("LT" %in% names(df))) {
df <- df %>% mutate(LT = CR - MR)
}
# ---- convert to months ----
# Approximate: 30.44 days per month (average)
days_to_months <- function(x) x / 30.437
df <- df %>%
mutate(MR_mo = days_to_months(MR),
CR_mo = days_to_months(CR),
LT_mo = days_to_months(LT))
# ---- ordering for y-axis ----
df <- df %>%
mutate(Earliest = pmin(MR_mo, CR_mo, na.rm = TRUE)) %>%
arrange(Earliest) %>%
mutate(Patient_f = factor(.data[[id_col]], levels = .data[[id_col]]))
# ---- long format for points ----
points_long <- df %>%
select(Patient_f, MR_mo, CR_mo) %>%
pivot_longer(c(MR_mo, CR_mo), names_to = "Type", values_to = "Months") %>%
mutate(Type = dplyr::recode(Type,
"MR_mo" = "First ctDNA positive",
"CR_mo" = "Last Radiological assessment"))
# ---- segments between MR and CR ----
segments_df <- df %>%
transmute(Patient_f,
x0 = MR_mo, x1 = CR_mo,
lt_flag = if_else(LT > 120, "gt120", "le120"))
# ---- annotation ----
med_lt <- median(df$LT_mo, na.rm = TRUE)
min_lt <- min(df$LT_mo, na.rm = TRUE)
max_lt <- max(df$LT_mo, na.rm = TRUE)
annot_label <- sprintf("Median time from ctDNA positive to last imaging:\n%.1f months (%.1f to %.1f)",
med_lt, min_lt, max_lt)
# ---- plot ----
pal <- c("First ctDNA positive" = "black",
"Last Radiological assessment" = "red")
x_max <- max(c(df$MR_mo, df$CR_mo), na.rm = TRUE)
y_mid <- levels(df$Patient_f)[ceiling(nlevels(df$Patient_f) * 0.55)]
p <- ggplot() +
geom_segment(data = segments_df,
aes(x = x0, xend = x1, y = Patient_f, yend = Patient_f,
linetype = lt_flag),
linewidth = 0.6, color = "black") +
scale_linetype_manual(values = c(le120 = "solid", gt120 = "dashed"),
guide = "none") +
geom_point(data = points_long,
aes(x = Months, y = Patient_f, color = Type),
size = 2.8) +
scale_color_manual(values = pal, name = NULL) +
labs(x = "Months from ctDNA positivity", y = NULL) +
annotate("text",
x = x_max * 0.73,
y = y_mid,
label = annot_label,
hjust = 0, vjust = 0.5, size = 3.8) +
scale_x_continuous(breaks = seq(0, ceiling(x_max/3)*3, by = 3)) +
coord_cartesian(xlim = c(0, x_max * 1.05)) +
theme_bw(base_size = 12) +
theme(
panel.grid.major = element_line(color = "grey85", linewidth = 0.3),
panel.grid.minor = element_line(color = "grey92", linewidth = 0.2),
axis.text.y = element_text(size = 9)
)
print(p)

wc_df <- df %>% dplyr::select(MR, CR) %>% tidyr::drop_na()
paired_diff <- wc_df$CR - wc_df$MR
wilx <- wilcox.test(
x = wc_df$CR, y = wc_df$MR,
paired = TRUE,
alternative = "two.sided",
conf.int = TRUE, # gives CI for the Hodges–Lehmann estimate of the median difference
exact = FALSE # safer for ties/large N
)
W_stat <- unname(wilx$statistic) # Wilcoxon V
p_value <- wilx$p.value
HL_est <- unname(wilx$estimate) # median of (CR - MR)
HL_ci <- wilx$conf.int # CI for median difference
# Simple p-value formatter for annotation/publication
fmt_p <- function(p) {
if (is.na(p)) return("NA")
if (p < 0.001) "< 0.001" else sprintf("= %.3f", round(p, 3))
}
p_text <- paste0("P ", fmt_p(p_value)) # e.g., "P = 0.003" or "P < 0.001"
# Optional: print a compact summary
cat("\nPaired Wilcoxon signed-rank test (CR vs MR)\n",
"-------------------------------------------\n",
sprintf("N pairs: %d", nrow(wc_df)), "\n",
sprintf("W (V): %g", W_stat), "\n",
sprintf("P-value: %s", ifelse(p_value < 0.001, "< 0.001", sprintf("%.6f", p_value))), "\n",
sprintf("Hodges–Lehmann median difference (CR - MR): %.1f days", HL_est), "\n",
sprintf("95%% CI: [%.1f, %.1f] days", HL_ci[1], HL_ci[2]), "\n", sep = "")
Paired Wilcoxon signed-rank test (CR vs MR)
-------------------------------------------
N pairs: 4
W (V): 10
P-value: 0.100348
Hodges–Lehmann median difference (CR - MR): 27.7 days
95% CI: [6.0, 291.0] days
#Individual Pt Plots_False Positive patients
#Filter for Patient UTUC-010
rm(list=ls())
setwd("~/Downloads")
circ_data <- read.csv("RWE UTUC_Pt Specific Plot data.csv")
plot_data <- circ_data %>%
filter(PatientName == "UTUC-010")
# 2. Process layers
# Handle ctDNA log-scale pseudo-zero
data_Signatera <- plot_data %>%
filter(Event_type %in% c("ctDNA_pos", "ctDNA_neg")) %>%
mutate(MTM_Log = ifelse(MTM.mL == 0, 0.001, MTM.mL))
data_Imaging <- plot_data %>%
filter(Event_type == "Imaging")
# Treatment blocks
data_Treatment <- plot_data %>%
filter(!is.na(Tx_type) & Tx_type != "NA" & !is.na(Tx_start.months)) %>%
distinct(Tx_type, Tx_start.months, Tx_end.months)
# 3. Create the Plot
ggplot() +
# Treatment Rectangles
geom_rect(data = data_Treatment,
aes(xmin = Tx_start.months, xmax = Tx_end.months, ymin = 0.004, ymax = 10),
fill = "lightblue", alpha = 0.15) +
geom_text(data = data_Treatment,
aes(x = (Tx_start.months + Tx_end.months)/2, y = 5, label = Tx_type),
angle = 90, color = "black", size = 3, fontface = "bold") +
# ctDNA Line
geom_line(data = data_Signatera,
aes(x = date.diff.months, y = MTM_Log), color = "black", linewidth = 0.7) +
# ctDNA Points (Mapped to Fill)
geom_point(data = data_Signatera,
aes(x = date.diff.months, y = MTM_Log, fill = Event_type),
shape = 21, size = 3.5, color = "black") +
# Imaging Points (Mapped to Shape and Color to create a second legend)
geom_point(data = data_Imaging,
aes(x = date.diff.months, y = 0.005, color = "NED in Scan"),
shape = 25, size = 4, fill = "darkgreen") +
# X-Axis: 3-month intervals
scale_x_continuous(breaks = seq(0, max(plot_data$date.diff.months, na.rm=T) + 3, by = 3)) +
# Y-Axis: Log10 scale
scale_y_log10(breaks = c(0.001, 0.01, 0.1, 1, 10),
labels = c("0", "0.01", "0.1", "1", "10"),
limits = c(0.001, 15)) +
# Legend for ctDNA
scale_fill_manual(values = c("ctDNA_pos" = "black", "ctDNA_neg" = "white"),
labels = c("Negative", "Positive"),
name = "ctDNA Status") +
# Legend for Imaging
scale_color_manual(values = c("NED in Scan" = "darkgreen"),
name = "Imaging Result") +
# Styling the legend to show the triangle correctly
guides(color = guide_legend(override.aes = list(shape = 25, fill = "darkgreen"))) +
labs(x = "Months from Surgery",
y = "MTM/mL",
title = "Clinical Timeline: UTUC-010") +
theme_minimal() +
theme(panel.grid.minor = element_blank(),
legend.position = "right")

#Filter for Patient UTUC-011
rm(list=ls())
setwd("~/Downloads")
circ_data <- read.csv("RWE UTUC_Pt Specific Plot data.csv")
plot_data <- circ_data %>%
filter(PatientName == "UTUC-011")
# 2. Process layers
# Handle ctDNA log-scale pseudo-zero
data_Signatera <- plot_data %>%
filter(Event_type %in% c("ctDNA_pos", "ctDNA_neg")) %>%
mutate(MTM_Log = ifelse(MTM.mL == 0, 0.001, MTM.mL))
data_Imaging <- plot_data %>%
filter(Event_type == "Imaging")
# Treatment blocks
data_Treatment <- plot_data %>%
filter(!is.na(Tx_type) & Tx_type != "NA" & !is.na(Tx_start.months)) %>%
distinct(Tx_type, Tx_start.months, Tx_end.months)
# 3. Create the Plot
ggplot() +
# Treatment Rectangles
geom_rect(data = data_Treatment,
aes(xmin = Tx_start.months, xmax = Tx_end.months, ymin = 0.004, ymax = 10),
fill = "lightblue", alpha = 0.15) +
geom_text(data = data_Treatment,
aes(x = (Tx_start.months + Tx_end.months)/2, y = 5, label = Tx_type),
angle = 90, color = "black", size = 3, fontface = "bold") +
# ctDNA Line
geom_line(data = data_Signatera,
aes(x = date.diff.months, y = MTM_Log), color = "black", linewidth = 0.7) +
# ctDNA Points (Mapped to Fill)
geom_point(data = data_Signatera,
aes(x = date.diff.months, y = MTM_Log, fill = Event_type),
shape = 21, size = 3.5, color = "black") +
# Imaging Points (Mapped to Shape and Color to create a second legend)
geom_point(data = data_Imaging,
aes(x = date.diff.months, y = 0.005, color = "NED in Scan"),
shape = 25, size = 4, fill = "darkgreen") +
# X-Axis: 3-month intervals
scale_x_continuous(breaks = seq(0, max(plot_data$date.diff.months, na.rm=T) + 3, by = 3)) +
# Y-Axis: Log10 scale
scale_y_log10(breaks = c(0.001, 0.01, 0.1, 1, 10),
labels = c("0", "0.01", "0.1", "1", "10"),
limits = c(0.001, 15)) +
# Legend for ctDNA
scale_fill_manual(values = c("ctDNA_pos" = "black", "ctDNA_neg" = "white"),
labels = c("Negative", "Positive"),
name = "ctDNA Status") +
# Legend for Imaging
scale_color_manual(values = c("NED in Scan" = "darkgreen"),
name = "Imaging Result") +
# Styling the legend to show the triangle correctly
guides(color = guide_legend(override.aes = list(shape = 25, fill = "darkgreen"))) +
labs(x = "Months from Surgery",
y = "MTM/mL",
title = "Clinical Timeline: UTUC-011") +
theme_minimal() +
theme(panel.grid.minor = element_blank(),
legend.position = "right")

---
title: "Ayanambakkam et al_ctDNA in UTUC Clinical Analysis"
output: html_notebook
---
library(swimplot)
library(grid)
library(gtable)
library(readr) 
library(mosaic)
library(dplyr) 
library(survival) 
library(survminer) 
library(ggplot2)
library(scales)
library(coxphf)
library(ggthemes)
library(tidyverse)
library(gtsummary)
library(flextable)
library(parameters)
library(car)
library(ComplexHeatmap)
library(tidyverse)
library(readxl)
library(janitor)
library(DT)
library(pROC)
library(rms)

#ctDNA Detection Rates by Window and Stages
```{r}
#ctDNA at MRD by Stage
rm(list=ls())
setwd("~/Downloads")
circ_data <- read.csv("RWE UTUC_Clinical Data 102025.csv")
circ_data <- circ_data[circ_data$Final.cohort=="TRUE",]
circ_data <- circ_data[circ_data$ctDNA.MRD!="",]
circ_data$ctDNA.MRD <- factor(circ_data$ctDNA.MRD, levels=c("NEGATIVE","POSITIVE"))
circ_data <- subset(circ_data, ctDNA.MRD %in% c("NEGATIVE", "POSITIVE"))
circ_data$Stage <- factor(circ_data$Stage, levels=c("0/I","II","III"))
positive_counts_by_stage <- aggregate(circ_data$ctDNA.MRD == "POSITIVE", by=list(circ_data$Stage), FUN=sum)
total_counts_by_stage <- aggregate(circ_data$ctDNA.MRD, by=list(circ_data$Stage), FUN=length)
combined_data <- data.frame(
  Stage = total_counts_by_stage$Group.1,
  Total_Count = total_counts_by_stage$x,
  Positive_Count = positive_counts_by_stage$x,
  Rate = (positive_counts_by_stage$x / total_counts_by_stage$x) * 100  # Convert to percentage
)
combined_data$Rate <- sprintf("%.2f%%", combined_data$Rate)
overall_total_count <- nrow(circ_data)
overall_positive_count <- nrow(circ_data[circ_data$ctDNA.MRD == "POSITIVE",])
overall_positivity_rate <- (overall_positive_count / overall_total_count) * 100  # Convert to percentage
overall_row <- data.frame(
  Stage = "Overall",
  Total_Count = overall_total_count,
  Positive_Count = overall_positive_count,
  Rate = sprintf("%.2f%%", overall_positivity_rate)
)
combined_data <- rbind(combined_data, overall_row)
print(combined_data)

#ctDNA at MRD by NAC status
rm(list=ls())
setwd("~/Downloads")
circ_data <- read.csv("RWE UTUC_Clinical Data 102025.csv")
circ_data <- circ_data[circ_data$Final.cohort=="TRUE",]
circ_data <- circ_data[circ_data$ctDNA.MRD!="",]
circ_data$ctDNA.MRD <- factor(circ_data$ctDNA.MRD, levels=c("NEGATIVE","POSITIVE"))
circ_data <- subset(circ_data, ctDNA.MRD %in% c("NEGATIVE", "POSITIVE"))
circ_data$NAC <- factor(circ_data$NAC, levels=c("FALSE","TRUE"))
positive_counts_by_stage <- aggregate(circ_data$ctDNA.MRD == "POSITIVE", by=list(circ_data$NAC), FUN=sum)
total_counts_by_stage <- aggregate(circ_data$ctDNA.MRD, by=list(circ_data$NAC), FUN=length)
combined_data <- data.frame(
  Stage = total_counts_by_stage$Group.1,
  Total_Count = total_counts_by_stage$x,
  Positive_Count = positive_counts_by_stage$x,
  Rate = (positive_counts_by_stage$x / total_counts_by_stage$x) * 100  # Convert to percentage
)
combined_data$Rate <- sprintf("%.2f%%", combined_data$Rate)
overall_total_count <- nrow(circ_data)
overall_positive_count <- nrow(circ_data[circ_data$ctDNA.MRD == "POSITIVE",])
overall_positivity_rate <- (overall_positive_count / overall_total_count) * 100  # Convert to percentage
overall_row <- data.frame(
  Stage = "Overall",
  Total_Count = overall_total_count,
  Positive_Count = overall_positive_count,
  Rate = sprintf("%.2f%%", overall_positivity_rate)
)
combined_data <- rbind(combined_data, overall_row)
print(combined_data)

#ctDNA at Surveillance by Stage
rm(list=ls())
setwd("~/Downloads")
circ_data <- read.csv("RWE UTUC_Clinical Data 102025.csv")
circ_data <- circ_data[circ_data$Final.cohort=="TRUE",]
circ_data <- circ_data[circ_data$ctDNA.Surveillance!="",]
circ_data$ctDNA.Surveillance <- factor(circ_data$ctDNA.Surveillance, levels=c("NEGATIVE","POSITIVE"))
circ_data$Stage <- factor(circ_data$Stage, levels=c("0/I","II","III"))
positive_counts_by_stage <- aggregate(circ_data$ctDNA.Surveillance == "POSITIVE", by=list(circ_data$Stage), FUN=sum)
total_counts_by_stage <- aggregate(circ_data$ctDNA.Surveillance, by=list(circ_data$Stage), FUN=length)
combined_data <- data.frame(
  Stage = total_counts_by_stage$Group.1,
  Total_Count = total_counts_by_stage$x,
  Positive_Count = positive_counts_by_stage$x,
  Rate = (positive_counts_by_stage$x / total_counts_by_stage$x) * 100  # Convert to percentage
)
combined_data$Rate <- sprintf("%.2f%%", combined_data$Rate)
overall_total_count <- nrow(circ_data)
overall_positive_count <- nrow(circ_data[circ_data$ctDNA.Surveillance == "POSITIVE",])
overall_positivity_rate <- (overall_positive_count / overall_total_count) * 100  # Convert to percentage
overall_row <- data.frame(
  Stage = "Overall",
  Total_Count = overall_total_count,
  Positive_Count = overall_positive_count,
  Rate = sprintf("%.2f%%", overall_positivity_rate)
)
combined_data <- rbind(combined_data, overall_row)
print(combined_data)

#ctDNA at anytime by Stage
rm(list=ls())
setwd("~/Downloads")
circ_data <- read.csv("RWE UTUC_Clinical Data 102025.csv")
circ_data <- circ_data[circ_data$Final.cohort=="TRUE",]
circ_data <- circ_data[circ_data$ctDNA.anytime!="",]
circ_data$ctDNA.anytime <- factor(circ_data$ctDNA.anytime, levels=c("NEGATIVE","POSITIVE"))
circ_data$Stage <- factor(circ_data$Stage, levels=c("0/I","II","III"))
positive_counts_by_stage <- aggregate(circ_data$ctDNA.anytime == "POSITIVE", by=list(circ_data$Stage), FUN=sum)
total_counts_by_stage <- aggregate(circ_data$ctDNA.anytime, by=list(circ_data$Stage), FUN=length)
combined_data <- data.frame(
  Stage = total_counts_by_stage$Group.1,
  Total_Count = total_counts_by_stage$x,
  Positive_Count = positive_counts_by_stage$x,
  Rate = (positive_counts_by_stage$x / total_counts_by_stage$x) * 100  # Convert to percentage
)
combined_data$Rate <- sprintf("%.2f%%", combined_data$Rate)
overall_total_count <- nrow(circ_data)
overall_positive_count <- nrow(circ_data[circ_data$ctDNA.anytime == "POSITIVE",])
overall_positivity_rate <- (overall_positive_count / overall_total_count) * 100  # Convert to percentage
overall_row <- data.frame(
  Stage = "Overall",
  Total_Count = overall_total_count,
  Positive_Count = overall_positive_count,
  Rate = sprintf("%.2f%%", overall_positivity_rate)
)
combined_data <- rbind(combined_data, overall_row)
print(combined_data)
```



#Demographics Table
```{r}
rm(list=ls())
setwd("~/Downloads") 
circ_data <- read.csv("RWE UTUC_Clinical Data 102025.csv")
circ_data <- circ_data[circ_data$Final.cohort=="TRUE",]

circ_data_subset <- circ_data %>%
  select(
    Age,
    Sex,
    Stage,
    NAC,
    ACT,
    ACT.type,
    RFS.Event,
    OS.Event,
    OS.months) %>%
  mutate(
    Age = as.numeric(Age),
    Sex = factor(Sex),
    Stage = factor(Stage),
    NAC = factor(NAC),
    ACT = factor(ACT),
    ACT.type = factor(ACT.type),
    RFS.Event = factor(RFS.Event, levels = c("FALSE", "TRUE"), labels = c("No Recurrence", "Recurrence")),
    OS.Event = factor(OS.Event, levels = c("FALSE", "TRUE"), labels = c("Alive", "Deceased")),
    OS.months = as.numeric(OS.months)) 
table1 <- circ_data_subset %>%
  tbl_summary(
    statistic = list(
      all_continuous() ~ "{median} ({min} - {max})",
      all_categorical() ~ "{n} ({p}%)")) %>%
  bold_labels()
table1
fit1 <- as_flex_table(
  table1,
  include = everything(),
  return_calls = FALSE)
fit1
save_as_docx(fit1, path= "~/Downloads/1. Demographics Table_RWE UTUC.docx")
```



#Overview plot
```{r}
rm(list=ls())
setwd("~/Downloads") 
clinstage<- read.csv("RWE UTUC_OP 102025.csv")
clinstage <- clinstage[clinstage$Final.cohort=="TRUE",]
clinstage_df<- as.data.frame(clinstage)

#Display the swimmer plot with the label box
oplot<-swimmer_plot(df=clinstage_df,
                    id='PatientName',
                    end='fu.diff.months',
                    fill='gray',
                    width=.01,)
oplot <- oplot + theme(panel.border = element_blank())
oplot <- oplot + scale_y_continuous(breaks = seq(0, 96, by = 3))
oplot <- oplot + labs(x ="Patients" , y="Time from Surgery (Months)")
oplot


##plot events
oplot_ev1 <- oplot + swimmer_points(df_points=clinstage_df,
                                    id='PatientName',
                                    time='date.diff.months',
                                    name_shape ='Event_type',
                                    name_col = 'Event',
                                    size=3.5,fill='black',
                                    #col='darkgreen'
)
oplot_ev1

#Shape customization to Event_type

oplot_ev1.1 <- oplot_ev1 + ggplot2::scale_shape_manual(name="Event_type",values=c(1,16,6,18,4),breaks=c('ctDNA_neg','ctDNA_pos','Imaging','Surgery','Death'))
oplot_ev1.1

#plot treatment

oplot_ev2 <- oplot_ev1.1 + swimmer_lines(df_lines=clinstage_df,
                                         id='PatientName',
                                         start='Tx_start.months',
                                         end='Tx_end.months',
                                         name_col='Tx_type',
                                         size=3.5,
                                         name_alpha = 1.0)
oplot_ev2 <- oplot_ev2 + guides(linetype = guide_legend(override.aes = list(size = 5, color = "black")))
oplot_ev2  

#colour customization
oplot_ev2.2 <- oplot_ev2 + ggplot2::scale_color_manual(name="Event",values=c( "lightblue","purple","green", "black", "black", "black", "lightblue","lightblue","green","red","green","green","blue","lightblue", "blue", "blue"))
oplot_ev2.2
```

#RFS by ctDNA status at MRD Window
```{r}
rm(list=ls())
setwd("~/Downloads") 
circ_data <- read.csv("RWE UTUC_Clinical Data 102025.csv")
circ_data <- circ_data[circ_data$Final.cohort=="TRUE",]
circ_data <- circ_data[circ_data$ctDNA.MRD!="",]
circ_data$RFS.MRD <- circ_data$RFS - circ_data$ctDNA.timing
circ_data$RFS.MRD.months <- circ_data$RFS.MRD / 30.437

survfit(Surv(time = circ_data$RFS.MRD.months, event = circ_data$RFS.Event)~ctDNA.MRD, data = circ_data)
event_summary <- circ_data %>%
  group_by(ctDNA.MRD) %>%
  summarise(
    Total = n(),
    Events = sum(RFS.Event),
    Fraction = Events / n(),
    Percentage = (Events / n()) * 100
  )
print(event_summary)
surv_object <-Surv(time = circ_data$RFS.MRD.months, event = circ_data$RFS.Event)
KM_curve <- survfit(surv_object ~ ctDNA.MRD, data = circ_data,conf.int=0.95,conf.type="log-log") 
ggsurvplot(KM_curve, data = circ_data, pval = FALSE, conf.int = FALSE, risk.table = TRUE, break.time.by=3, palette=c("blue","red"), title="RFS - ctDNA at the MRD Window", ylab= "Recurrence-Free Survival", xlab="Time from Landmark Time point (months)", legend.labs=c("ctDNA Negative", "ctDNA Positive"), legend.title="")
summary(KM_curve, times= c(0, 12, 18))
circ_data$ctDNA.MRD <- factor(circ_data$ctDNA.MRD, levels=c("NEGATIVE","POSITIVE"))
cox_fit <- coxph(surv_object ~ ctDNA.MRD, data=circ_data) 
ggforest(cox_fit,data = circ_data) 
summary(cox_fit)
cox_fit_summary <- summary(cox_fit)

#Extract values for HR, 95% CI, and p-value
HR <- cox_fit_summary$coefficients[2]
lower_CI <- cox_fit_summary$conf.int[3]
upper_CI <- cox_fit_summary$conf.int[4]
p_value <- cox_fit_summary$coefficients[5]
label_text <- paste0("HR = ", round(HR, 2), " (", round(lower_CI, 2), "-", round(upper_CI, 2), "); p = ", round(p_value, 3))
print(label_text)

circ_data$ctDNA.MRD <- factor(circ_data$ctDNA.MRD, levels = c("NEGATIVE", "POSITIVE"), labels = c("Negative", "Positive"))
circ_data$RFS.Event <- factor(circ_data$RFS.Event, levels = c("FALSE", "TRUE"), labels = c("No Recurrence", "Recurrence"))
contingency_table <- table(circ_data$ctDNA.MRD, circ_data$RFS.Event)
chi_square_test <- chisq.test(contingency_table)
print(chi_square_test)
fisher_exact_test <- fisher.test(contingency_table)
print(fisher_exact_test)
print(contingency_table)
table_df <- as.data.frame(contingency_table)
table_df$Total <- ave(table_df$Freq, table_df$Var1, FUN = sum)
table_df$Percentage <- table_df$Freq / table_df$Total
table_df$MiddlePercentage <- table_df$Percentage / 2
ggplot(table_df, aes(x = Var1, y = Percentage, fill = Var2)) +
  geom_bar(stat = "identity") +
  geom_text(aes(y = MiddlePercentage, label = Freq), position = "stack", color = "black", vjust = 1.5, size = 7) +
  theme_minimal() +
  labs(title = "ctDNA status at MRD", 
       x = "ctDNA", 
       y = "Patients (%)", 
       fill = "Recurrence",
       caption = paste("Fisher's exact test p-value: ", format.pval(fisher_exact_test$p.value))) +
  scale_y_continuous(labels = scales::percent_format()) +
  scale_fill_manual(values = c("No Recurrence" = "blue", "Recurrence" = "red")) + # define custom colors
  theme(axis.text.x = element_text(angle = 0, hjust = 1.5, size = 14), # increase x-axis text size
        axis.text.y = element_text(size = 14, color = "black"), # increase y-axis text size
        axis.title.x = element_text(size = 14, color = "black"), # increase x-axis label size
        axis.title.y = element_text(size = 14, color = "black"), # increase y-axis label size
        legend.text = element_text(size = 12, color = "black"))  # increase Recurrence label size
```

#RFS by ctDNA status at Surveillance Window
```{r}
rm(list=ls())
setwd("~/Downloads") 
circ_data <- read.csv("RWE UTUC_Clinical Data 102025.csv")
circ_data <- circ_data[circ_data$Final.cohort=="TRUE",]
circ_data <- circ_data[circ_data$ctDNA.Surveillance!="",]

survfit(Surv(time = circ_data$RFS.months, event = circ_data$RFS.Event)~ctDNA.Surveillance, data = circ_data)
event_summary <- circ_data %>%
  group_by(ctDNA.Surveillance) %>%
  summarise(
    Total = n(),
    Events = sum(RFS.Event),
    Fraction = Events / n(),
    Percentage = (Events / n()) * 100
  )
print(event_summary)
surv_object <-Surv(time = circ_data$RFS.months, event = circ_data$RFS.Event)
KM_curve <- survfit(surv_object ~ ctDNA.Surveillance, data = circ_data,conf.int=0.95,conf.type="log-log") 
ggsurvplot(KM_curve, data = circ_data, pval = FALSE, conf.int = FALSE, risk.table = TRUE, break.time.by=6, palette=c("blue","red"), title="RFS - ctDNA at the Surveillance Window", ylab= "Recurrence-Free Survival", xlab="Time from Surgery (months)", legend.labs=c("ctDNA Negative", "ctDNA Positive"), legend.title="")
summary(KM_curve, times= c(0, 12, 24))
circ_data$ctDNA.Surveillance <- factor(circ_data$ctDNA.Surveillance, levels=c("NEGATIVE","POSITIVE"))
cox_fit <- coxph(surv_object ~ ctDNA.Surveillance, data=circ_data) 
ggforest(cox_fit,data = circ_data) 
summary(cox_fit)
cox_fit_summary <- summary(cox_fit)

#Extract values for HR, 95% CI, and p-value
HR <- cox_fit_summary$coefficients[2]
lower_CI <- cox_fit_summary$conf.int[3]
upper_CI <- cox_fit_summary$conf.int[4]
p_value <- cox_fit_summary$coefficients[5]
label_text <- paste0("HR = ", round(HR, 2), " (", round(lower_CI, 2), "-", round(upper_CI, 2), "); p = ", round(p_value, 3))
print(label_text)

circ_data$ctDNA.Surveillance <- factor(circ_data$ctDNA.Surveillance, levels = c("NEGATIVE", "POSITIVE"), labels = c("Negative", "Positive"))
circ_data$RFS.Event <- factor(circ_data$RFS.Event, levels = c("FALSE", "TRUE"), labels = c("No Recurrence", "Recurrence"))
contingency_table <- table(circ_data$ctDNA.Surveillance, circ_data$RFS.Event)
chi_square_test <- chisq.test(contingency_table)
print(chi_square_test)
fisher_exact_test <- fisher.test(contingency_table)
print(fisher_exact_test)
print(contingency_table)
table_df <- as.data.frame(contingency_table)
table_df$Total <- ave(table_df$Freq, table_df$Var1, FUN = sum)
table_df$Percentage <- table_df$Freq / table_df$Total
table_df$MiddlePercentage <- table_df$Percentage / 2
ggplot(table_df, aes(x = Var1, y = Percentage, fill = Var2)) +
  geom_bar(stat = "identity") +
  geom_text(aes(y = MiddlePercentage, label = Freq), position = "stack", color = "black", vjust = 1.5, size = 7) +
  theme_minimal() +
  labs(title = "ctDNA status at Surveillance", 
       x = "ctDNA", 
       y = "Patients (%)", 
       fill = "Recurrence",
       caption = paste("Fisher's exact test p-value: ", format.pval(fisher_exact_test$p.value))) +
  scale_y_continuous(labels = scales::percent_format()) +
  scale_fill_manual(values = c("No Recurrence" = "blue", "Recurrence" = "red")) + # define custom colors
  theme(axis.text.x = element_text(angle = 0, hjust = 1.5, size = 14), # increase x-axis text size
        axis.text.y = element_text(size = 14, color = "black"), # increase y-axis text size
        axis.title.x = element_text(size = 14, color = "black"), # increase x-axis label size
        axis.title.y = element_text(size = 14, color = "black"), # increase y-axis label size
        legend.text = element_text(size = 12, color = "black"))  # increase Recurrence label size
```

#Time-dependent analysis for RFS in longitudinal time points
```{r}
rm(list=ls())
setwd("~/Downloads")
dt <- read_xlsx("CLIA UTUC Clinical Data_Time dependent.xlsx") |>
  clean_names() |>
  mutate(across(
    .cols = c(window_start_date, dfs_date, surveillance_1_date:surveillance_12_date),
    .fns = ~ as_date(as.Date(.x, format = "%Y-%m-%d"))
  )) |>
  filter(final_cohort == TRUE)

dt_biomarker <- dt |>
  select(pts_id, ct_dna_surveillance_available,
         window_start_date,
         surveillance_1_status:surveillance_12_date) |>
  filter(ct_dna_surveillance_available) |>
  pivot_longer(cols = surveillance_1_status:surveillance_12_date,
               names_to = c("visit_number", ".value"),
               names_pattern = "surveillance_(.)_(.*)") |>
  mutate(biomarker_time = day(days(date - window_start_date))) |>
  select(pts_id, biomarker_time, biomarker_status = status) |>
  filter(!is.na(biomarker_time))

glimpse(dt_biomarker)

dt_survival <- dt |>
  select(pts_id, ct_dna_surveillance_available,
         window_start_date:dfs_date, dfs_event) |>  # Added dfs_event here
  filter(ct_dna_surveillance_available) |>
  mutate(dfs_time = (dfs_date - window_start_date),
         dfs_time = day(days(dfs_time)),
         dfs_event = as.numeric(dfs_event)) |>
  select(pts_id, dfs_time, dfs_event)

glimpse(dt_survival)

aux <- dt_survival %>% 
  filter(dfs_time <= 0)

tab <- left_join(aux, dt) |>
  select(pts_id, window_start_date, dfs_time, dfs_date,
         surveillance_1_date:surveillance_12_date) |>
  mutate(across(.cols = dfs_date:surveillance_12_date, 
                .fns = ~ as_date(.x))) |>
  select(pts_id, window_start_date, dfs_date, dfs_time)

datatable(tab, filter = "top")

dt_survival <- dt_survival |>
  filter(dfs_time > 0)

aux <- dt |>
  select(pts_id, ct_dna_surveillance_available,
         window_start_date, dfs_date,
         surveillance_1_date:surveillance_12_date) |>
  mutate(across(.cols = surveillance_1_date:surveillance_12_date, 
                .fns = ~ .x - window_start_date)) |>
  mutate(across(.cols = surveillance_1_date:surveillance_12_date, 
                .fns = ~ .x < 0)) |>
  rowwise() |>
  mutate(sum_neg = 
           sum(c_across(surveillance_1_date:surveillance_12_date),
               na.rm = TRUE))  |>
  select(pts_id, sum_neg)

tab <- left_join(aux, dt) |>
  filter(sum_neg > 0) |>
  select(pts_id, sum_neg, window_start_date,
         surveillance_1_date:surveillance_12_date) |>
  mutate(across(.cols = window_start_date:surveillance_12_date, 
                .fns = ~ as_date(.x))) 

datatable(tab, filter = "top")

aux <- dt |>
  select(pts_id, ct_dna_surveillance_available,
         window_start_date, dfs_date,
         surveillance_1_date:surveillance_12_date) |>
  mutate(across(.cols = dfs_date:surveillance_12_date, 
                .fns = ~ .x - window_start_date)) |>
  mutate(across(.cols = surveillance_2_date:surveillance_12_date,
                .fns = ~ dfs_date < .x)) |>
  rowwise() |>
  mutate(n_biomarker_after_event = sum(c_across(surveillance_2_date:
                                                  surveillance_12_date), 
                                       na.rm = TRUE)) |>
  mutate(across(.cols = surveillance_1_date:surveillance_12_date,
                .fns = ~ !is.na(.x))) |>
  mutate(total_biomarker = sum(c_across(surveillance_2_date:
                                          surveillance_12_date), 
                               na.rm = TRUE)) |>
  select(pts_id, n_biomarker_after_event, total_biomarker)

temp <- aux |> 
  select(-pts_id) |> 
  group_by(n_biomarker_after_event, total_biomarker) |>  # Direct grouping
  summarise(freq = n(), .groups = "drop")  # Drop groups after summarization


tab <- left_join(aux, dt) |>
  select(pts_id, n_biomarker_after_event, total_biomarker, 
         dfs_date,
         surveillance_2_date:surveillance_12_date) |>
  mutate(across(.cols = dfs_date:surveillance_12_date, 
                .fns = ~ as_date(.x))) |>
  filter(n_biomarker_after_event > 0)
datatable(tab, filter = "top")

aux <- tmerge(data1 = dt_survival, 
              data2 = dt_survival,
              id = pts_id, 
              dfs_event = event(dfs_time, dfs_event))
dt_final <- tmerge(data1 = aux, 
                   data2 = dt_biomarker,
                   id = pts_id, 
                   biomarker_status = 
                     tdc(biomarker_time, biomarker_status))

datatable(dt_final, filter = "top")

# Syntax if there is not time-dependent covariate
# fit <- coxph(Surv(dfs_time, dfs_event) ~ biomarker_status,
#              data = dt_final)
# summary(fit)

fit <- coxph(Surv(tstart, tstop, dfs_event) ~ biomarker_status,
             data = dt_final)
summary(fit)
cox_fit_summary <- summary(fit)

#Extract values for HR, 95% CI, and p-value
HR <- cox_fit_summary$coefficients[2]
lower_CI <- cox_fit_summary$conf.int[3]
upper_CI <- cox_fit_summary$conf.int[4]
p_value <- cox_fit_summary$coefficients[5]
label_text <- paste0("HR = ", round(HR, 2), " (", round(lower_CI, 2), "-", round(upper_CI, 2), "); p = ", round(p_value, 3))
print(label_text)
```


#Median numbers of time points in the longitudinal setting
```{r}
# Load the dataset
rm(list=ls())
setwd("~/Downloads") 
circ_data <- read.csv("RWE UTUC_Clinical Data 102025.csv")
circ_data <- circ_data[circ_data$Final.cohort=="TRUE",]
circ_data <- circ_data[circ_data$ctDNA.Surveillance!="",]
circ_datadf <- as.data.frame(circ_data)

median_Nsurvtps <- median(circ_datadf$Nsurvtps, na.rm = TRUE)
min_Nsurvtps <- min(circ_datadf$Nsurvtps, na.rm = TRUE)
max_Nsurvtps <- max(circ_datadf$Nsurvtps, na.rm = TRUE)

cat(sprintf("Median # of surveillance time points: %d (%d-%d)\n", 
            median_Nsurvtps, min_Nsurvtps, max_Nsurvtps))
```

#Median of median interval of ctDNA timepoints and radiological imaging assessment
```{r}
rm(list=ls())
setwd("~/Downloads")
df <- read.csv("RWE UTUC_OP 102025.csv", stringsAsFactors = FALSE)

names(df) <- trimws(names(df))

# ---- Helper: compute median interval per patient ----
median_interval_per_patient <- function(data, filter_col, filter_val) {
  data %>%
    filter(!!sym(filter_col) == filter_val) %>%
    arrange(PatientName, date.diff) %>%
    group_by(PatientName) %>%
    mutate(interval = date.diff - lag(date.diff)) %>%
    summarise(median_interval = median(interval, na.rm = TRUE), .groups = "drop") %>%
    mutate(event_group = filter_val)
}

# ---- Compute per-patient medians ----
ctDNA_patient_medians   <- median_interval_per_patient(df, "Event", "ctDNA")
imaging_patient_medians <- median_interval_per_patient(df, "Event_type", "Imaging")

# ---- Function to summarize patient-level medians to cohort-level ----
cohort_summary <- function(patient_data, event_label) {
  # Filter out patients with NA median intervals
  valid_data <- patient_data %>% filter(!is.na(median_interval))
  
  data.frame(
    Event_Type = event_label,
    Cohort_Median_Frequency = median(valid_data$median_interval, na.rm = TRUE),
    Min_Patient_Median = min(valid_data$median_interval, na.rm = TRUE),
    Max_Patient_Median = max(valid_data$median_interval, na.rm = TRUE),
    Range_Patient_Median = max(valid_data$median_interval, na.rm = TRUE) - 
      min(valid_data$median_interval, na.rm = TRUE)
  )
}

# ---- Combine all results ----
results <- bind_rows(
  cohort_summary(ctDNA_patient_medians, "ctDNA"),
  cohort_summary(imaging_patient_medians, "Imaging")
)

# ---- Print cohort-level summary ----
cat("===== Median Frequency (Days) per Cohort =====\n")
print(results, row.names = FALSE)

# ---- OPTIONAL: Save patient-level medians ----
patient_level_medians <- bind_rows(ctDNA_patient_medians, imaging_patient_medians)
#write.csv(patient_level_medians, "patient_median_intervals.csv", row.names = FALSE)
```

#Plot for individual lead-time calculations for each pt
```{r}
rm(list=ls())
csv_path <- "~/Downloads/CLIA UTUC_Lead Time pts.csv"
df <- read.csv(csv_path, check.names = FALSE)
if ("Final.cohort" %in% names(df)) {
  df <- df[df$Final.cohort == TRUE, ]
} else {
  warning("Column 'Final.cohort' not found in the dataset.")
}
id_candidates <- c("Patient","ID","Pt","Subject","Sample")
id_col <- id_candidates[id_candidates %in% names(df)][1]
if (is.na(id_col)) {
  df <- df %>% mutate(Patient = row_number())
  id_col <- "Patient"
}

num_cols <- intersect(c("MR","CR","LT"), names(df))
df[num_cols] <- lapply(df[num_cols], function(x) suppressWarnings(as.numeric(x)))

# If LT missing, compute in days
if (!("LT" %in% names(df))) {
  df <- df %>% mutate(LT = CR - MR)
}

# ---- convert to months ----
# Approximate: 30.44 days per month (average)
days_to_months <- function(x) x / 30.437

df <- df %>%
  mutate(MR_mo = days_to_months(MR),
         CR_mo = days_to_months(CR),
         LT_mo = days_to_months(LT))

# ---- ordering for y-axis ----
df <- df %>%
  mutate(Earliest = pmin(MR_mo, CR_mo, na.rm = TRUE)) %>%
  arrange(Earliest) %>%
  mutate(Patient_f = factor(.data[[id_col]], levels = .data[[id_col]]))

# ---- long format for points ----
points_long <- df %>%
  select(Patient_f, MR_mo, CR_mo) %>%
  pivot_longer(c(MR_mo, CR_mo), names_to = "Type", values_to = "Months") %>%
  mutate(Type = dplyr::recode(Type,
                              "MR_mo" = "Molecular recurrence",
                              "CR_mo" = "Clinical recurrence"))

# ---- segments between MR and CR ----
segments_df <- df %>%
  transmute(Patient_f,
            x0 = MR_mo, x1 = CR_mo,
            lt_flag = if_else(LT > 120, "gt120", "le120"))

# ---- annotation ----
med_lt  <- median(df$LT_mo, na.rm = TRUE)
min_lt  <- min(df$LT_mo, na.rm = TRUE)
max_lt  <- max(df$LT_mo, na.rm = TRUE)

annot_label <- sprintf("Median lead-time:\n%.1f months (%.1f to %.1f)",
                       med_lt, min_lt, max_lt)

# ---- plot ----
pal <- c("Molecular recurrence" = "#10B4C1",
         "Clinical recurrence"  = "#C96A72")

x_max <- max(c(df$MR_mo, df$CR_mo), na.rm = TRUE)
y_mid <- levels(df$Patient_f)[ceiling(nlevels(df$Patient_f) * 0.55)]

p <- ggplot() +
  geom_segment(data = segments_df,
               aes(x = x0, xend = x1, y = Patient_f, yend = Patient_f,
                   linetype = lt_flag),
               linewidth = 0.6, color = "black") +
  scale_linetype_manual(values = c(le120 = "solid", gt120 = "dashed"),
                        guide = "none") +
  geom_point(data = points_long,
             aes(x = Months, y = Patient_f, color = Type),
             size = 2.8) +
  scale_color_manual(values = pal, name = NULL) +
  labs(x = "Months from Surgery or Definitive Treatment", y = NULL) +
  annotate("text",
           x = x_max * 0.73,
           y = y_mid,
           label = annot_label,
           hjust = 0, vjust = 0.5, size = 3.8) +
  scale_x_continuous(breaks = seq(0, ceiling(x_max/3)*3, by = 3)) +
  coord_cartesian(xlim = c(0, x_max * 1.05)) +
  theme_bw(base_size = 12) +
  theme(
    panel.grid.major = element_line(color = "grey85", linewidth = 0.3),
    panel.grid.minor = element_line(color = "grey92", linewidth = 0.2),
    axis.text.y = element_text(size = 9)
  )

print(p)

wc_df <- df %>% dplyr::select(MR, CR) %>% tidyr::drop_na()
paired_diff <- wc_df$CR - wc_df$MR

wilx <- wilcox.test(
  x = wc_df$CR, y = wc_df$MR,
  paired = TRUE,
  alternative = "two.sided",
  conf.int = TRUE,      # gives CI for the Hodges–Lehmann estimate of the median difference
  exact = FALSE         # safer for ties/large N
)

W_stat   <- unname(wilx$statistic)          # Wilcoxon V
p_value  <- wilx$p.value
HL_est   <- unname(wilx$estimate)           # median of (CR - MR)
HL_ci    <- wilx$conf.int                   # CI for median difference

# Simple p-value formatter for annotation/publication
fmt_p <- function(p) {
  if (is.na(p)) return("NA")
  if (p < 0.001) "< 0.001" else sprintf("= %.3f", round(p, 3))
}
p_text <- paste0("P ", fmt_p(p_value))  # e.g., "P = 0.003" or "P < 0.001"

# Optional: print a compact summary
cat("\nPaired Wilcoxon signed-rank test (CR vs MR)\n",
    "-------------------------------------------\n",
    sprintf("N pairs: %d", nrow(wc_df)), "\n",
    sprintf("W (V): %g", W_stat), "\n",
    sprintf("P-value: %s", ifelse(p_value < 0.001, "< 0.001", sprintf("%.6f", p_value))), "\n",
    sprintf("Hodges–Lehmann median difference (CR - MR): %.1f days", HL_est), "\n",
    sprintf("95%% CI: [%.1f, %.1f] days", HL_ci[1], HL_ci[2]), "\n", sep = "")
```


#Multivariate cox regression at Surveillance Window for RFS
```{r}
rm(list=ls())
setwd("~/Downloads") 
circ_data <- read.csv("RWE UTUC_Clinical Data 102025.csv")
circ_data <- circ_data[circ_data$Final.cohort=="TRUE",]
circ_data <- circ_data[circ_data$ctDNA.Surveillance!="",]

circ_data$ctDNA.Surveillance <- factor(circ_data$ctDNA.Surveillance, levels=c("NEGATIVE","POSITIVE"), labels = c("Negative", "Positive"))
circ_data$cStage <- factor(circ_data$cStage, levels = c("0/I", "II/III"))
circ_data$NAC <- factor(circ_data$NAC, levels = c("FALSE", "TRUE"))
circ_data$ACT <- factor(circ_data$ACT, levels = c("FALSE", "TRUE"))
surv_object <- Surv(time = circ_data$RFS.months, event = circ_data$RFS.Event) 
cox_fit <- coxph(surv_object ~ ctDNA.Surveillance + Age + cStage + NAC + ACT, data=circ_data) 
ggforest(cox_fit, data = circ_data, main = "Multivariate Regression Model for RFS - All Stages", refLabel = "Reference Group")
test.ph <- cox.zph(cox_fit)
```

#ctDNA and MTM/mL Dynamics for pts at surveillance window
```{r}
#Dynamics and MTM/mL plots for patients with ctDNA negative at surveillance
rm(list=ls())
setwd("~/Downloads")
df <- read.csv("CLIA UTUC ctDNA MTM.csv", stringsAsFactors = FALSE)
df <- df[df$Final.cohort=="TRUE",]
df <- df[df$ctDNA.Surveillance=="NEGATIVE",]

df$RFS.Event <- ifelse(df$RFS.Event %in% c("No", "no", "FALSE", "False", "0"), FALSE,
                       ifelse(df$RFS.Event %in% c("Yes", "yes", "TRUE", "True", "1"), TRUE, NA))
df$RFS.Event <- factor(df$RFS.Event, levels = c(FALSE, TRUE))
df <- df %>%
  group_by(PatientName) %>%
  filter(n() >= 2) %>% #keep only pts with at least 2 post-surgery time points
  ungroup()

num_unique <- length(unique(df$PatientName))
cat("Number of unique patients:", num_unique, "\n")

df_patient_pfs <- df %>%
  group_by(PatientName) %>%
  dplyr::summarize(
    PFS_True = any(RFS.Event == TRUE, na.rm = TRUE),
    PFS_False = all(RFS.Event == FALSE, na.rm = TRUE)
  )

num_true <- sum(df_patient_pfs$PFS_True)
num_false <- sum(df_patient_pfs$PFS_False)

cat("Number of unique patients with Event:", num_true, "\n")
cat("Number of unique patients with No Event:", num_false, "\n")

p <- ggplot(df, aes(x = date.diff.months, 
                    y = MTM.mL, 
                    group = PatientName, 
                    color = RFS.Event)) +
  geom_line() +      # Connect timepoints for each patient
  geom_point() +     # Add points for each timepoint
  # Use a log10 scale for the y-axis with specified breaks
  scale_y_log10(breaks = c(0.01, 0.1, 1, 10, 100, 1000),
                labels = c("0.01","0.1", "1", "10", "100", "1000")) +
  scale_x_continuous(breaks = seq(0, max(df$date.diff.months, na.rm = TRUE), by = 6)) +
  scale_color_manual(values = c("FALSE" = "blue", "TRUE" = "red")) +
  labs(
    x = "Time Since Surgery or start of definitive treatment (months)",
    y = "Mean Tumor Molecules per mL (MTM/mL)",
    color = "RFS Event"
  ) +
  theme_minimal()
print(p)

#Dynamics and MTM/mL plots for patients with ctDNA positive at surveillance
rm(list=ls())
setwd("~/Downloads")
df <- read.csv("CLIA UTUC ctDNA MTM.csv", stringsAsFactors = FALSE)
df <- df[df$Final.cohort=="TRUE",]
df <- df[df$ctDNA.Surveillance=="POSITIVE",]

df$RFS.Event <- ifelse(df$RFS.Event %in% c("No", "no", "FALSE", "False", "0"), FALSE,
                       ifelse(df$RFS.Event %in% c("Yes", "yes", "TRUE", "True", "1"), TRUE, NA))
df$RFS.Event <- factor(df$RFS.Event, levels = c(FALSE, TRUE))
df <- df %>%
  group_by(PatientName) %>%
  filter(n() >= 2) %>% #keep only pts with at least 2 post-surgery time points
  ungroup()

num_unique <- length(unique(df$PatientName))
cat("Number of unique patients:", num_unique, "\n")

df_patient_pfs <- df %>%
  group_by(PatientName) %>%
  dplyr::summarize(
    PFS_True = any(RFS.Event == TRUE, na.rm = TRUE),
    PFS_False = all(RFS.Event == FALSE, na.rm = TRUE)
  )

num_true <- sum(df_patient_pfs$PFS_True)
num_false <- sum(df_patient_pfs$PFS_False)

cat("Number of unique patients with Event:", num_true, "\n")
cat("Number of unique patients with No Event:", num_false, "\n")

p <- ggplot(df, aes(x = date.diff.months, 
                    y = MTM.mL, 
                    group = PatientName, 
                    color = RFS.Event)) +
  geom_line() +      # Connect timepoints for each patient
  geom_point() +     # Add points for each timepoint
  # Use a log10 scale for the y-axis with specified breaks
  scale_y_log10(breaks = c(0.01, 0.1, 1, 10, 100, 1000),
                labels = c("0.01","0.1", "1", "10", "100", "1000")) +
  scale_x_continuous(breaks = seq(0, max(df$date.diff.months, na.rm = TRUE), by = 6)) +
  scale_color_manual(values = c("FALSE" = "blue", "TRUE" = "red")) +
  labs(
    x = "Time Since Surgery or start of definitive treatment (months)",
    y = "Mean Tumor Molecules per mL (MTM/mL)",
    color = "RFS Event"
  ) +
  theme_minimal()
print(p)
```

#ctDNA velocity and lead time liner regression
```{r}
rm(list=ls())
csv_path <- "~/Downloads/CLIA UTUC_ctDNA velocity.csv"  # <- adjust if needed
df_raw <- read.csv(csv_path, check.names = FALSE)
df <- df_raw %>%
  rename(MTM_mL = `MTM.mL`) %>%
  mutate(
    daysCR.months = as.numeric(daysCR.months),
    MTM_mL = as.numeric(MTM_mL)
  ) %>%
  filter(Final.cohort == TRUE, !is.na(PatientName), !is.na(daysCR.months), !is.na(MTM_mL))

preCR <- df %>%
  filter(daysCR.months <= 0, is.finite(MTM_mL), MTM_mL > 0)

eligible <- preCR %>%
  group_by(PatientName) %>%
  filter(n() >= 2, n_distinct(daysCR.months) >= 2) %>%
  ungroup()
if (nrow(eligible) == 0) {
  warning("No patients have ≥2 valid pre-recurrence points with distinct times; regression lines will be omitted.")
}

fits <- eligible %>%
  group_by(PatientName) %>%
  summarise(x_min = min(daysCR.months, na.rm = TRUE), .groups = "drop") %>%
  mutate(grid = map(x_min, ~seq(.x, 0, length.out = 50))) %>%
  select(PatientName, grid)

predict_patient <- function(dat, newx) {
  if (length(newx) == 0) {
    return(tibble(daysCR.months = numeric(0), MTM_mL = numeric(0)))
  }
  dat2 <- dat %>%
    filter(is.finite(MTM_mL), MTM_mL > 0) %>%
    mutate(log_ctdna = log10(MTM_mL))
  if (nrow(dat2) < 2 || n_distinct(dat2$daysCR.months) < 2 || any(!is.finite(dat2$log_ctdna))) {
    return(tibble(daysCR.months = numeric(0), MTM_mL = numeric(0)))
  }
  m <- lm(log_ctdna ~ daysCR.months, data = dat2)
  tibble(
    daysCR.months = newx,
    MTM_mL = 10 ^ predict(m, newdata = tibble(daysCR.months = newx))
  )
}

pred_lines <- eligible %>%
  group_by(PatientName) %>%
  tidyr::nest() %>%                       # list-column "data" per patient
  left_join(fits, by = "PatientName") %>% # list-column "grid" per patient
  mutate(pred = map2(data, grid, ~predict_patient(.x, .y))) %>%
  select(PatientName, pred) %>%
  tidyr::unnest(pred)

pooled_line <- {
  if (nrow(preCR) >= 2 && n_distinct(preCR$daysCR.months) >= 2) {
    pooled_x <- seq(min(preCR$daysCR.months, na.rm = TRUE), 0, length.out = 100)
    pooled_fit <- lm(log10(MTM_mL) ~ daysCR.months, data = preCR)
    tibble(
      daysCR.months = pooled_x,
      MTM_mL = 10 ^ predict(pooled_fit, newdata = tibble(daysCR.months = pooled_x))
    )
  } else {
    tibble(daysCR.months = numeric(0), MTM_mL = numeric(0))
  }
}

x_min <- floor(min(df$daysCR.months, na.rm = TRUE) / 3) * 3
x_max <- ceiling(max(df$daysCR.months, na.rm = TRUE) / 3) * 3
y_min_pos <- max(min(df$MTM_mL[df$MTM_mL > 0], na.rm = TRUE) / 2, 0.01)
y_max_pos <- 10 ^ ceiling(log10(max(df$MTM_mL, na.rm = TRUE)))
log_breaks <- 10 ^ seq(floor(log10(y_min_pos)), ceiling(log10(y_max_pos)))

p <- ggplot() +
  # per-patient fitted lines (if any)
  geom_line(data = pred_lines,
            aes(x = daysCR.months, y = MTM_mL, color = PatientName),
            linewidth = 1) +
  # pooled dashed trend (if any)
  geom_line(data = pooled_line,
            aes(x = daysCR.months, y = MTM_mL),
            linewidth = 0.8, linetype = "dashed", color = "grey35") +
  # raw points (all timepoints, before and after)
  geom_point(data = df,
             aes(x = daysCR.months, y = MTM_mL, color = PatientName),
             size = 1.8, alpha = 0.85) +
  # vertical line at imaging-positive date
  geom_vline(xintercept = 0, linetype = "dashed", linewidth = 0.8, color = "black") +
  scale_y_log10(breaks = log_breaks,
                labels = label_number(accuracy = 1)) +
  scale_x_continuous(breaks = seq(x_min, x_max, by = 3)) +  # every 3 months
  labs(
    x = "Months before/after clinical recurrence (0 = imaging positive)",
    y = "ctDNA level (MTM/mL)",
    color = "Patient"
  ) +
  theme_bw(base_size = 12) +
  theme(
    panel.grid.major = element_line(color = "grey85", linewidth = 0.3),
    panel.grid.minor = element_line(color = "grey92", linewidth = 0.2),
    legend.position = "none"  # change to "right" if you want the legend
  )

print(p)
```


#Time from first ctDNA positive timepoint to last imaging in surveillance_False Positive patients
```{r}
rm(list=ls())
csv_path <- "~/Downloads/CLIA UTUC_Time from ctDNA to imaging FP pts.csv"
df <- read.csv(csv_path, check.names = FALSE)
if ("Final.cohort" %in% names(df)) {
  df <- df[df$Final.cohort == TRUE, ]
} else {
  warning("Column 'Final.cohort' not found in the dataset.")
}
id_candidates <- c("Patient","ID","Pt","Subject","Sample")
id_col <- id_candidates[id_candidates %in% names(df)][1]
if (is.na(id_col)) {
  df <- df %>% mutate(Patient = row_number())
  id_col <- "Patient"
}

num_cols <- intersect(c("MR","CR","LT"), names(df))
df[num_cols] <- lapply(df[num_cols], function(x) suppressWarnings(as.numeric(x)))

# If LT missing, compute in days
if (!("LT" %in% names(df))) {
  df <- df %>% mutate(LT = CR - MR)
}

# ---- convert to months ----
# Approximate: 30.44 days per month (average)
days_to_months <- function(x) x / 30.437

df <- df %>%
  mutate(MR_mo = days_to_months(MR),
         CR_mo = days_to_months(CR),
         LT_mo = days_to_months(LT))

# ---- ordering for y-axis ----
df <- df %>%
  mutate(Earliest = pmin(MR_mo, CR_mo, na.rm = TRUE)) %>%
  arrange(Earliest) %>%
  mutate(Patient_f = factor(.data[[id_col]], levels = .data[[id_col]]))

# ---- long format for points ----
points_long <- df %>%
  select(Patient_f, MR_mo, CR_mo) %>%
  pivot_longer(c(MR_mo, CR_mo), names_to = "Type", values_to = "Months") %>%
  mutate(Type = dplyr::recode(Type,
                              "MR_mo" = "First ctDNA positive",
                              "CR_mo" = "Last Radiological assessment"))

# ---- segments between MR and CR ----
segments_df <- df %>%
  transmute(Patient_f,
            x0 = MR_mo, x1 = CR_mo,
            lt_flag = if_else(LT > 120, "gt120", "le120"))

# ---- annotation ----
med_lt  <- median(df$LT_mo, na.rm = TRUE)
min_lt  <- min(df$LT_mo, na.rm = TRUE)
max_lt  <- max(df$LT_mo, na.rm = TRUE)

annot_label <- sprintf("Median time from ctDNA positive to last imaging:\n%.1f months (%.1f to %.1f)",
                       med_lt, min_lt, max_lt)

# ---- plot ----
pal <- c("First ctDNA positive" = "black",
         "Last Radiological assessment"  = "red")

x_max <- max(c(df$MR_mo, df$CR_mo), na.rm = TRUE)
y_mid <- levels(df$Patient_f)[ceiling(nlevels(df$Patient_f) * 0.55)]

p <- ggplot() +
  geom_segment(data = segments_df,
               aes(x = x0, xend = x1, y = Patient_f, yend = Patient_f,
                   linetype = lt_flag),
               linewidth = 0.6, color = "black") +
  scale_linetype_manual(values = c(le120 = "solid", gt120 = "dashed"),
                        guide = "none") +
  geom_point(data = points_long,
             aes(x = Months, y = Patient_f, color = Type),
             size = 2.8) +
  scale_color_manual(values = pal, name = NULL) +
  labs(x = "Months from ctDNA positivity", y = NULL) +
  annotate("text",
           x = x_max * 0.73,
           y = y_mid,
           label = annot_label,
           hjust = 0, vjust = 0.5, size = 3.8) +
  scale_x_continuous(breaks = seq(0, ceiling(x_max/3)*3, by = 3)) +
  coord_cartesian(xlim = c(0, x_max * 1.05)) +
  theme_bw(base_size = 12) +
  theme(
    panel.grid.major = element_line(color = "grey85", linewidth = 0.3),
    panel.grid.minor = element_line(color = "grey92", linewidth = 0.2),
    axis.text.y = element_text(size = 9)
  )

print(p)

wc_df <- df %>% dplyr::select(MR, CR) %>% tidyr::drop_na()
paired_diff <- wc_df$CR - wc_df$MR

wilx <- wilcox.test(
  x = wc_df$CR, y = wc_df$MR,
  paired = TRUE,
  alternative = "two.sided",
  conf.int = TRUE,      # gives CI for the Hodges–Lehmann estimate of the median difference
  exact = FALSE         # safer for ties/large N
)

W_stat   <- unname(wilx$statistic)          # Wilcoxon V
p_value  <- wilx$p.value
HL_est   <- unname(wilx$estimate)           # median of (CR - MR)
HL_ci    <- wilx$conf.int                   # CI for median difference

# Simple p-value formatter for annotation/publication
fmt_p <- function(p) {
  if (is.na(p)) return("NA")
  if (p < 0.001) "< 0.001" else sprintf("= %.3f", round(p, 3))
}
p_text <- paste0("P ", fmt_p(p_value))  # e.g., "P = 0.003" or "P < 0.001"

# Optional: print a compact summary
cat("\nPaired Wilcoxon signed-rank test (CR vs MR)\n",
    "-------------------------------------------\n",
    sprintf("N pairs: %d", nrow(wc_df)), "\n",
    sprintf("W (V): %g", W_stat), "\n",
    sprintf("P-value: %s", ifelse(p_value < 0.001, "< 0.001", sprintf("%.6f", p_value))), "\n",
    sprintf("Hodges–Lehmann median difference (CR - MR): %.1f days", HL_est), "\n",
    sprintf("95%% CI: [%.1f, %.1f] days", HL_ci[1], HL_ci[2]), "\n", sep = "")
```


#Individual Pt Plots_False Positive patients
```{r}
#Filter for Patient UTUC-010
rm(list=ls())
setwd("~/Downloads")
circ_data <- read.csv("RWE UTUC_Pt Specific Plot data.csv")
plot_data <- circ_data %>% 
  filter(PatientName == "UTUC-010") 

# 2. Process layers
# Handle ctDNA log-scale pseudo-zero
data_Signatera <- plot_data %>% 
  filter(Event_type %in% c("ctDNA_pos", "ctDNA_neg")) %>%
  mutate(MTM_Log = ifelse(MTM.mL == 0, 0.001, MTM.mL))

data_Imaging <- plot_data %>% 
  filter(Event_type == "Imaging")

# Treatment blocks
data_Treatment <- plot_data %>% 
  filter(!is.na(Tx_type) & Tx_type != "NA" & !is.na(Tx_start.months)) %>%
  distinct(Tx_type, Tx_start.months, Tx_end.months)

# 3. Create the Plot
ggplot() +
  # Treatment Rectangles
  geom_rect(data = data_Treatment, 
            aes(xmin = Tx_start.months, xmax = Tx_end.months, ymin = 0.004, ymax = 10), 
            fill = "lightblue", alpha = 0.15) +
  geom_text(data = data_Treatment, 
            aes(x = (Tx_start.months + Tx_end.months)/2, y = 5, label = Tx_type), 
            angle = 90, color = "black", size = 3, fontface = "bold") +
  
  # ctDNA Line
  geom_line(data = data_Signatera, 
            aes(x = date.diff.months, y = MTM_Log), color = "black", linewidth = 0.7) +
  
  # ctDNA Points (Mapped to Fill)
  geom_point(data = data_Signatera, 
             aes(x = date.diff.months, y = MTM_Log, fill = Event_type), 
             shape = 21, size = 3.5, color = "black") +
  
  # Imaging Points (Mapped to Shape and Color to create a second legend)
  geom_point(data = data_Imaging, 
             aes(x = date.diff.months, y = 0.005, color = "NED in Scan"), 
             shape = 25, size = 4, fill = "darkgreen") +
  
  # X-Axis: 3-month intervals
  scale_x_continuous(breaks = seq(0, max(plot_data$date.diff.months, na.rm=T) + 3, by = 3)) +
  
  # Y-Axis: Log10 scale
  scale_y_log10(breaks = c(0.001, 0.01, 0.1, 1, 10),
                labels = c("0", "0.01", "0.1", "1", "10"),
                limits = c(0.001, 15)) +
  
  # Legend for ctDNA
  scale_fill_manual(values = c("ctDNA_pos" = "black", "ctDNA_neg" = "white"), 
                    labels = c("Negative", "Positive"), 
                    name = "ctDNA Status") +
  
  # Legend for Imaging
  scale_color_manual(values = c("NED in Scan" = "darkgreen"), 
                     name = "Imaging Result") +
  
  # Styling the legend to show the triangle correctly
  guides(color = guide_legend(override.aes = list(shape = 25, fill = "darkgreen"))) +
  
  labs(x = "Months from Surgery", 
       y = "MTM/mL", 
       title = "Clinical Timeline: UTUC-010") +
  theme_minimal() +
  theme(panel.grid.minor = element_blank(),
        legend.position = "right")

#Filter for Patient UTUC-011
rm(list=ls())
setwd("~/Downloads")
circ_data <- read.csv("RWE UTUC_Pt Specific Plot data.csv")
plot_data <- circ_data %>% 
  filter(PatientName == "UTUC-011") 

# 2. Process layers
# Handle ctDNA log-scale pseudo-zero
data_Signatera <- plot_data %>% 
  filter(Event_type %in% c("ctDNA_pos", "ctDNA_neg")) %>%
  mutate(MTM_Log = ifelse(MTM.mL == 0, 0.001, MTM.mL))

data_Imaging <- plot_data %>% 
  filter(Event_type == "Imaging")

# Treatment blocks
data_Treatment <- plot_data %>% 
  filter(!is.na(Tx_type) & Tx_type != "NA" & !is.na(Tx_start.months)) %>%
  distinct(Tx_type, Tx_start.months, Tx_end.months)

# 3. Create the Plot
ggplot() +
  # Treatment Rectangles
  geom_rect(data = data_Treatment, 
            aes(xmin = Tx_start.months, xmax = Tx_end.months, ymin = 0.004, ymax = 10), 
            fill = "lightblue", alpha = 0.15) +
  geom_text(data = data_Treatment, 
            aes(x = (Tx_start.months + Tx_end.months)/2, y = 5, label = Tx_type), 
            angle = 90, color = "black", size = 3, fontface = "bold") +
  
  # ctDNA Line
  geom_line(data = data_Signatera, 
            aes(x = date.diff.months, y = MTM_Log), color = "black", linewidth = 0.7) +
  
  # ctDNA Points (Mapped to Fill)
  geom_point(data = data_Signatera, 
             aes(x = date.diff.months, y = MTM_Log, fill = Event_type), 
             shape = 21, size = 3.5, color = "black") +
  
  # Imaging Points (Mapped to Shape and Color to create a second legend)
  geom_point(data = data_Imaging, 
             aes(x = date.diff.months, y = 0.005, color = "NED in Scan"), 
             shape = 25, size = 4, fill = "darkgreen") +
  
  # X-Axis: 3-month intervals
  scale_x_continuous(breaks = seq(0, max(plot_data$date.diff.months, na.rm=T) + 3, by = 3)) +
  
  # Y-Axis: Log10 scale
  scale_y_log10(breaks = c(0.001, 0.01, 0.1, 1, 10),
                labels = c("0", "0.01", "0.1", "1", "10"),
                limits = c(0.001, 15)) +
  
  # Legend for ctDNA
  scale_fill_manual(values = c("ctDNA_pos" = "black", "ctDNA_neg" = "white"), 
                    labels = c("Negative", "Positive"), 
                    name = "ctDNA Status") +
  
  # Legend for Imaging
  scale_color_manual(values = c("NED in Scan" = "darkgreen"), 
                     name = "Imaging Result") +
  
  # Styling the legend to show the triangle correctly
  guides(color = guide_legend(override.aes = list(shape = 25, fill = "darkgreen"))) +
  
  labs(x = "Months from Surgery", 
       y = "MTM/mL", 
       title = "Clinical Timeline: UTUC-011") +
  theme_minimal() +
  theme(panel.grid.minor = element_blank(),
        legend.position = "right")
```

